require(RLDNe)
require(car)
require(DHARMa)
require(emmeans)
require(MASS)
require(effects)
require(glmmTMB)
require(lme4)
require(kableExtra)
require(gt)
require(gtsummary)
require(tidyverse)
require(magrittr)
require(countreg)
require(lmerTest)

Summary

This notebook contains a log of all analyses for the 2022 North Santiam Chinook salmon genetic pedigree study. Inference of the pedigree used here is conducted in a separate notebook titled “parentage_assignment_log” in this same repository.

Goals
(1) Summarise assignments
(2) Estimate total lifetime fitness (TLF) for parent cohorts
(3) Estimate cohort replacement rates for parent cohorts
(4) Assess variables that influence fitness with general linear models
(5) Estimate effective number of breeders using NeEstimator

This is an R notebook. The .html version of this file is a fully rendered and interactive log. To view it, save the html and open in a browse. The .rmd version can be opened within R studio. To reproduce results or edit the analysis: clone the full repository onto tyour local machine and open the r project in rstudio. This will provide all needed data and objects.

Data

Import

Here we import metadata and the final pedigree.

load("../parentage/parentage_results/meta_data.R")
load("../parentage/parentage_results/pedigree.R")

# let's get the metadata on the pedigree
pedigree_meta <- meta_data %>%
  select(sample_id, year, type, date) %>%
  rename_with(.fn = ~ paste0("offspring_", .x)) %>%
  right_join(pedigree, by = c("offspring_sample_id" = "offspring_sample_id"))

#father
pedigree_meta <- meta_data %>%
  select(sample_id, year, type, date, above_below) %>%
  rename_with(.fn = ~ paste0("father_", .x)) %>%
  right_join(pedigree_meta, by = c("father_sample_id" = "father")) %>%
  rename(father = father_sample_id)

#mother
pedigree_meta <- meta_data %>%
  select(sample_id, year, type, date, above_below) %>%
  rename_with(.fn = ~ paste0("mother_", .x)) %>%
  right_join(pedigree_meta, by = c("mother_sample_id" = "mother")) %>%
  rename(mother = mother_sample_id)

Dataset Summary

Here we summarize the metadata of salmon that are part of the final filtered dataset used as candidate parents and offspring.

tbl_summary(select(meta_data, type, year), by = type, percent = "row") %>%
  modify_header(label ~ "") %>%
  modify_spanning_header(c("stat_1", "stat_2", "stat_3", "stat_4", "stat_5") ~ "**type**") %>%
  as_kable_extra() %>%
  kable_classic(full_width = F, html_font = "Cambria")# %>%
type
carcass, N = 427 carcass_above, N = 15 outplant, N = 6,349 reintro, N = 5,135 reintro_above, N = 431
year
2011 197 (57%) 0 (0%) 149 (43%) 0 (0%) 0 (0%)
2012 84 (25%) 0 (0%) 258 (75%) 0 (0%) 0 (0%)
2013 39 (2.3%) 0 (0%) 1,125 (65%) 554 (32%) 0 (0%)
2014 46 (2.8%) 0 (0%) 861 (52%) 754 (45%) 0 (0%)
2015 19 (1.1%) 15 (0.9%) 1,042 (63%) 148 (8.9%) 431 (26%)
2016 19 (1.0%) 0 (0%) 1,310 (71%) 520 (28%) 0 (0%)
2017 14 (0.7%) 0 (0%) 1,604 (76%) 505 (24%) 0 (0%)
2018 3 (1.2%) 0 (0%) 0 (0%) 248 (99%) 0 (0%)
2019 6 (0.7%) 0 (0%) 0 (0%) 813 (99%) 0 (0%)
2020 0 (0%) 0 (0%) 0 (0%) 1,593 (100%) 0 (0%)
1 n (%)

The dataset described above includes all sampled parents from 2011 - 2017 and all sampled offspring from 2014 - 2020, after filtering for missingness and duplicates.

Note: Some of the results we work with in this report are properties of parents, and others are properties of offspring. Throughout the report we include results for parent years 2011-2015, and offspring years for 2016-2020. The pedigree is complete (full age structure present in dataset) for parent years 2011-2015 and for offspring years 2016-2020.
For example: Fitness is a property of parents and we report fitness for parent years 2011-2015, so it requires information from additional offspring years that are not part of the range of years for which we report offspring level data (2014 and 2015).
As another example, while 2014 offspring are included in the pedigree, the complete set of their potential parents are not (2009, 2010). This is because we do not include results for parents years 2009 or 2010, but we do include 2011. Similarly 2017 parents are included in the study because they contribute as parents to 2020 offspring year, but we do not sample their potential age 4 and age 5 offspring, so we do not report full 2017 parent level results.

Releases

Let’s summarise outplant release locations by location and date to present in table 2. Only present 2016 and 2017 because table 2 already complete for 2011 - 2015

kable(meta_data %>%
  filter(above_below == "above") %>%
    filter(year > 2015) %>%
  count(year, date, location), align = "c", caption = "Table 2 from draft report totals") %>%
  kable_classic(full_width = F, html_font = "Cambria") 
Table 2 from draft report totals
year date location n
2016 NA breitenbush river 1253
2016 NA horn creek 57
2017 2017-06-27 log deck 175
2017 2017-07-06 breitenbush river 167
2017 2017-07-10 log deck 155
2017 2017-07-13 breitenbush river 1
2017 2017-07-13 horn creek 65
2017 2017-07-17 log deck 153
2017 2017-07-26 log deck 142
2017 2017-08-08 breitenbush river 76
2017 2017-08-14 log deck 57
2017 2017-08-22 breitenbush river 68
2017 2017-08-31 log deck 165
2017 2017-09-13 log deck 103
2017 2017-09-18 breitenbush river 277
kable(meta_data %>%
  filter(above_below == "above", geno_sex == "F") %>%
    filter(year > 2015) %>%
  count(year, date, location), align = "c", caption = "Table 2 from draft report females") %>%
  kable_classic(full_width = F, html_font = "Cambria") 
Table 2 from draft report females
year date location n
2016 NA breitenbush river 830
2016 NA horn creek 28
2017 2017-06-27 log deck 89
2017 2017-07-06 breitenbush river 59
2017 2017-07-10 log deck 66
2017 2017-07-13 horn creek 30
2017 2017-07-17 log deck 71
2017 2017-07-26 log deck 74
2017 2017-08-08 breitenbush river 26
2017 2017-08-14 log deck 25
2017 2017-08-22 breitenbush river 24
2017 2017-08-31 log deck 72
2017 2017-09-13 log deck 23
2017 2017-09-18 breitenbush river 132
kable(meta_data %>%
  filter(above_below == "above", geno_sex == "M") %>%
    filter(year > 2015) %>%
  count(year, date, location), align = "c", caption = "Table 2 from draft report males") %>%
  kable_classic(full_width = F, html_font = "Cambria")
Table 2 from draft report males
year date location n
2016 NA breitenbush river 423
2016 NA horn creek 29
2017 2017-06-27 log deck 86
2017 2017-07-06 breitenbush river 108
2017 2017-07-10 log deck 89
2017 2017-07-13 breitenbush river 1
2017 2017-07-13 horn creek 35
2017 2017-07-17 log deck 82
2017 2017-07-26 log deck 68
2017 2017-08-08 breitenbush river 50
2017 2017-08-14 log deck 32
2017 2017-08-22 breitenbush river 44
2017 2017-08-31 log deck 93
2017 2017-09-13 log deck 80
2017 2017-09-18 breitenbush river 145

Assignment Rates

This section calculates number of assignments between different parent and offspring classes.

Table 3

Here we present the assignment rates into the format of table 3 from the draft report

Each table in the notebook below can be used to recreate table 3. Each table represents a single pair of offspring and parent years. The type of offspring is listed in the first column and the results are split between parent types along the remaining columns (e.g. same format as table 3)

When the “type” of parent varies between two parents, the female parent type is listed first. This is slightly different from table 3, so close attention is required when doing data entry here. For example, if an offspring is assigned to a reintroduced mother and carcass father, the column would be called “reintro/carcass.”

# the format of the table is difficult to produce in r
# let's not change the format of the table, instead we'll write some helper functions for filling it out 

#let's add a column for the type of assignment, and one for combined types
pedigree_meta %<>%
  mutate(assn_type = case_when((mother == "none" & father == "none") ~ "none",
                               (mother == "none" & father != "none") ~ "male_only",
                               (mother != "none" & father == "none") ~ "female_only",
                               (mother != "none" & father != "none") ~ "pair",)) %>%
  mutate(parent_type = case_when((father_type == mother_type) ~ father_type,
                                 (is.na(father_type) & !(is.na(mother_type))) ~ mother_type,
                                  (is.na(mother_type) & !(is.na(father_type))) ~ father_type,
                                   (father_type != mother_type) ~ paste(mother_type, father_type, sep = "/")))

# function
t4_helper <- function(p_year, off_year){pedigree_meta %>%
  filter(offspring_year == off_year) %>%
  mutate(parent_year = (coalesce(father_year, mother_year))) %>%
  filter(parent_year == p_year) %>%
  mutate(parent_type = as.factor(parent_type)) %>%
  select(offspring_type, parent_type, assn_type) %>%
  tbl_strata(
    strata = parent_type,
    .tbl_fun = ~ .x %>%
      tbl_summary( by = assn_type, percent = NULL)
  ) %>%
  modify_caption(paste(paste("parent year: ", p_year), paste("offspring year: ", off_year), sep = "  ,")) %>%
  as_kable_extra() %>%
  kable_classic(full_width = F, html_font = "Cambria")
}

Offspring Year 2016

#t4 helper function example
t4_helper("2011", "2016")
parent year: 2011 ,offspring year: 2016
carcass
outplant
Characteristic female_only, N = 4 pair, N = 1 female_only, N = 7 male_only, N = 4 pair, N = 32
offspring_type
carcass 1 (25%) 0 (0%)
reintro 3 (75%) 1 (100%) 7 (100%) 4 (100%) 32 (100%)
1 n (%)
t4_helper("2012", "2016")
parent year: 2012 ,offspring year: 2016
carcass
outplant
Characteristic female_only, N = 7 male_only, N = 7 female_only, N = 14 male_only, N = 3 pair, N = 69
offspring_type
carcass 1 (14%) 1 (14%)
reintro 6 (86%) 6 (86%) 14 (100%) 3 (100%) 69 (100%)
1 n (%)
t4_helper("2013", "2016")
parent year: 2013 ,offspring year: 2016
carcass
outplant
reintro
Characteristic female_only, N = 1 male_only, N = 1 female_only, N = 7 male_only, N = 12 pair, N = 5 female_only, N = 8 male_only, N = 6 pair, N = 3
offspring_type
carcass 1 (100%) 0 (0%) 0 (0%) 1 (8.3%) 0 (0%) 0 (0%) 1 (17%) 0 (0%)
reintro 0 (0%) 1 (100%) 7 (100%) 11 (92%) 5 (100%) 8 (100%) 5 (83%) 3 (100%)
1 n (%)

Offspring Year 2017

t4_helper("2012", "2017")
parent year: 2012 ,offspring year: 2017
carcass
outplant
Characteristic female_only, N = 2 male_only, N = 4 pair, N = 1 female_only, N = 10 male_only, N = 2 pair, N = 66
offspring_type
reintro 2 (100%) 4 (100%) 1 (100%) 10 (100%) 1 (50%) 66 (100%)
carcass 0 (0%) 1 (50%) 0 (0%)
1 n (%)
t4_helper("2013", "2017")
parent year: 2013 ,offspring year: 2017
carcass/reintro
outplant
reintro
Characteristic pair, N = 1 female_only, N = 11 male_only, N = 13 pair, N = 148 female_only, N = 7 male_only, N = 17 pair, N = 37
offspring_type
reintro 1 (100%) 11 (100%) 12 (92%) 147 (99%) 7 (100%) 16 (94%) 37 (100%)
carcass 0 (0%) 1 (7.7%) 1 (0.7%) 0 (0%) 1 (5.9%) 0 (0%)
1 n (%)
t4_helper("2014", "2017")
parent year: 2014 ,offspring year: 2017
carcass
outplant
reintro
Characteristic female_only, N = 1 male_only, N = 1 female_only, N = 3 male_only, N = 2 pair, N = 2 female_only, N = 3 male_only, N = 4 pair, N = 8
offspring_type
reintro 1 (100%) 1 (100%) 3 (100%) 2 (100%) 2 (100%) 3 (100%) 3 (75%) 8 (100%)
carcass 0 (0%) 1 (25%) 0 (0%)
1 n (%)

Offspring Year 2018

t4_helper("2013", "2018")
parent year: 2013 ,offspring year: 2018
outplant
reintro
Characteristic female_only, N = 2 male_only, N = 2 pair, N = 42 female_only, N = 2 male_only, N = 2 pair, N = 8
offspring_type
reintro 2 (100%) 2 (100%) 42 (100%) 2 (100%) 2 (100%) 8 (100%)
1 n (%)
t4_helper("2014", "2018")
parent year: 2014 ,offspring year: 2018
outplant
reintro
Characteristic female_only, N = 2 male_only, N = 4 pair, N = 38 female_only, N = 8 male_only, N = 6 pair, N = 11
offspring_type
carcass 0 (0%) 1 (25%) 0 (0%)
reintro 2 (100%) 3 (75%) 38 (100%) 8 (100%) 6 (100%) 11 (100%)
1 n (%)
t4_helper("2015", "2018")
parent year: 2015 ,offspring year: 2018
carcass
outplant
outplant/reintro_above
reintro
reintro_above
reintro_above/outplant
Characteristic male_only, N = 1 female_only, N = 3 male_only, N = 4 pair, N = 9 pair, N = 3 female_only, N = 3 male_only, N = 5 pair, N = 14 female_only, N = 6 male_only, N = 2 pair, N = 2 pair, N = 1
offspring_type
carcass 1 (100%)
reintro 3 (100%) 4 (100%) 9 (100%) 3 (100%) 3 (100%) 5 (100%) 14 (100%) 6 (100%) 2 (100%) 2 (100%) 1 (100%)
1 n (%)

Offspring Year 2019

t4_helper("2014", "2019")
parent year: 2014 ,offspring year: 2019
outplant
reintro
Characteristic female_only, N = 2 male_only, N = 10 pair, N = 24 female_only, N = 3 male_only, N = 4 pair, N = 3
offspring_type
reintro 2 (100%) 10 (100%) 24 (100%) 3 (100%) 4 (100%) 3 (100%)
1 n (%)
t4_helper("2015", "2019")
parent year: 2015 ,offspring year: 2019
carcass
carcass_above
carcass_above/outplant
outplant
outplant/reintro_above
reintro
reintro_above
reintro_above/carcass_above
reintro_above/outplant
reintro/carcass
Characteristic female_only, N = 1 male_only, N = 2 pair, N = 1 female_only, N = 29 male_only, N = 21 pair, N = 157 pair, N = 102 female_only, N = 19 male_only, N = 10 pair, N = 26 female_only, N = 26 male_only, N = 30 pair, N = 90 pair, N = 1 pair, N = 17 pair, N = 1
offspring_type
reintro 1 (100%) 2 (100%) 1 (100%) 28 (97%) 20 (95%) 157 (100%) 102 (100%) 18 (95%) 10 (100%) 26 (100%) 26 (100%) 30 (100%) 90 (100%) 1 (100%) 17 (100%) 1 (100%)
carcass 1 (3.4%) 1 (4.8%) 0 (0%) 1 (5.3%) 0 (0%) 0 (0%)
1 n (%)
t4_helper("2016", "2019")
parent year: 2016 ,offspring year: 2019
carcass
outplant
reintro
Characteristic male_only, N = 1 female_only, N = 7 male_only, N = 10 pair, N = 56 female_only, N = 4 male_only, N = 3 pair, N = 5
offspring_type
reintro 1 (100%) 7 (100%) 10 (100%) 56 (100%) 4 (100%) 3 (100%) 5 (100%)
1 n (%)

Offspring Year 2020

t4_helper("2015", "2020")
parent year: 2015 ,offspring year: 2020
outplant
outplant/reintro_above
reintro
reintro_above
reintro_above/carcass_above
reintro_above/outplant
Characteristic female_only, N = 21 male_only, N = 19 pair, N = 55 pair, N = 48 female_only, N = 1 male_only, N = 3 pair, N = 1 female_only, N = 13 male_only, N = 11 pair, N = 34 pair, N = 2 pair, N = 8
offspring_type
reintro 21 (100%) 19 (100%) 55 (100%) 48 (100%) 1 (100%) 3 (100%) 1 (100%) 13 (100%) 11 (100%) 34 (100%) 2 (100%) 8 (100%)
1 n (%)
t4_helper("2016", "2020")
parent year: 2016 ,offspring year: 2020
carcass
outplant
reintro
Characteristic female_only, N = 3 male_only, N = 1 female_only, N = 76 male_only, N = 44 pair, N = 981 female_only, N = 22 male_only, N = 17 pair, N = 67
offspring_type
reintro 3 (100%) 1 (100%) 76 (100%) 44 (100%) 981 (100%) 22 (100%) 17 (100%) 67 (100%)
1 n (%)
t4_helper("2017", "2020")
parent year: 2017 ,offspring year: 2020
outplant
reintro
Characteristic female_only, N = 1 male_only, N = 1 pair, N = 7 female_only, N = 3 male_only, N = 3 pair, N = 7
offspring_type
reintro 1 (100%) 1 (100%) 7 (100%) 3 (100%) 3 (100%) 7 (100%)
1 n (%)

Age at Maturity

We can also calculate age at maturity for offspring year using these data. These results appear in the draft report in the text, but we will summarise here in a table and a figure

# report text is formatted as number and percent per age for each offspring year

kable(pedigree_meta %>%
        filter(!(offspring_year %in% c("2014", "2015"))) %>%
  mutate(parent_year = (coalesce(father_year, mother_year))) %>%
  filter(!(is.na(parent_year))) %>%
  mutate(age = as.numeric(offspring_year) - as.numeric(parent_year)) %>%
  group_by(offspring_year, age) %>%
  summarise(n = n()) %>%
  mutate(percent = 100*(n/sum(n))), digits = 0) %>%
  kable_classic(full_width = F, html_font = "Cambria") %>%
  kable_styling(fixed_thead = T) %>%
  collapse_rows(columns = 1)
offspring_year age n percent
2016 3 43 23
2016 4 100 52
2016 5 48 25
2017 3 24 7
2017 4 234 68
2017 5 85 25
2018 3 53 29
2018 4 69 38
2018 5 58 32
2019 3 86 13
2019 4 533 80
2019 5 46 7
2020 3 22 2
2020 4 1211 84
2020 5 216 15
aam_data <- pedigree_meta %>%
        filter(!(offspring_year %in% c("2014", "2015"))) %>%
  mutate(parent_year = (coalesce(father_year, mother_year))) %>%
  filter(!(is.na(parent_year))) %>%
  mutate(age = as.numeric(offspring_year) - as.numeric(parent_year)) %>%
  group_by(offspring_year, age) %>%
  summarise(n = n()) %>%
  mutate(percent = 100*(n/sum(n)))

ggplot(data = aam_data)+geom_bar(aes(x = offspring_year, color = as.factor(age), fill = as.factor(age), y = n), stat = "identity", position = "dodge")+scale_fill_viridis_d(name = "Age")+scale_colour_viridis_d(name = "Age")+theme_bw()+xlab("Offspring Year")+ggtitle("Age at Maturity")

Total Lifetime Fitness

In this section, we calculate the number of offspring assigned to parents from the pedigree and calculate summary statistics.

#first let's get a dataframe that can be easily used to calculate parent level information
# all candidate parents, the number of time they appear in the pedigree and their metadata

parents <- meta_data %>%
  filter(year %in% 2011:2017) #checked this against the input parent lists, same size

father_counts <- pedigree %>%
  group_by(father) %>%
  count() %>%
  rename(parent = father)

mother_counts <- pedigree %>%
  group_by(mother) %>%
  count() %>%
  rename(parent = mother)

parent_counts <- bind_rows(mother_counts, father_counts) 
rm(mother_counts)
rm(father_counts)

parents %<>%
  left_join(parent_counts, by = c("sample_id" = "parent")) %>%
  rename(tlf = n) %>%
  mutate(tlf = replace_na(tlf, 0))

Table 4

Here we format the TLF results to match table 4 of the draft report.

kable(parents %>%
  group_by(year, type, geno_sex) %>%
  summarise(N = n(), mean_tlf = mean(tlf), sd_tlf = sd(tlf), range = paste(min(tlf), " -  ", max(tlf))), align = "c", caption = "Total Lifetime Fitness by Parent Group and Sex") %>%
  kable_classic(full_width = F, html_font = "Cambria") %>%
  kable_styling(fixed_thead = T) %>%
  collapse_rows(columns = 1, valign = "top") %>%
  scroll_box(height = "400px")
Total Lifetime Fitness by Parent Group and Sex
year type geno_sex N mean_tlf sd_tlf range
2011 carcass F 104 0.2692308 0.5784272 0 - 3
2011 carcass M 93 0.2473118 0.5833918 0 - 3
2011 outplant F 72 1.1388889 2.1050279 0 - 8
2011 outplant M 77 0.8701299 2.1234942 0 - 15
2012 carcass F 45 0.2888889 0.6260345 0 - 2
2012 carcass M 39 0.3076923 0.5691104 0 - 2
2012 outplant F 146 1.1301370 2.0452256 0 - 10
2012 outplant M 112 1.3125000 2.4715726 0 - 17
2013 carcass F 20 0.1000000 0.3077935 0 - 1
2013 carcass M 19 0.0526316 0.2294157 0 - 1
2013 outplant F 478 0.4497908 0.9585705 0 - 6
2013 outplant M 647 0.3431221 0.8564334 0 - 7
2013 reintro F 165 0.3939394 0.7626732 0 - 5
2013 reintro M 389 0.1902314 0.5741935 0 - 5
2014 carcass F 23 0.0434783 0.2085144 0 - 1
2014 carcass M 23 0.0434783 0.2085144 0 - 1
2014 outplant F 292 0.2431507 0.7592189 0 - 8
2014 outplant M 569 0.1405975 0.4942490 0 - 5
2014 reintro F 294 0.1224490 0.3675945 0 - 2
2014 reintro M 460 0.0782609 0.3273403 0 - 2
2015 carcass F 8 0.1250000 0.3535534 0 - 1
2015 carcass M 11 0.1818182 0.4045199 0 - 1
2015 carcass_above F 6 0.1666667 0.4082483 0 - 1
2015 carcass_above M 9 0.5555556 1.1303883 0 - 3
2015 outplant F 519 0.8227360 1.5507451 0 - 10
2015 outplant M 523 0.5583174 1.2433511 0 - 8
2015 reintro F 70 0.9285714 1.6967268 0 - 10
2015 reintro M 78 0.7564103 1.6607454 0 - 10
2015 reintro_above F 144 1.3888889 2.4469508 0 - 19
2015 reintro_above M 287 1.1219512 2.7217922 0 - 26
2016 carcass F 10 0.3000000 0.4830459 0 - 1
2016 carcass M 9 0.2222222 0.4409586 0 - 1
2016 outplant F 858 1.3053613 2.0011652 0 - 15
2016 outplant M 452 2.4137168 3.6812086 0 - 30
2016 reintro F 220 0.4454545 0.8340512 0 - 5
2016 reintro M 300 0.3066667 0.8880226 0 - 9
2017 carcass F 8 0.0000000 0.0000000 0 - 0
2017 carcass M 6 0.0000000 0.0000000 0 - 0
2017 outplant F 691 0.0115774 0.1070513 0 - 1
2017 outplant M 913 0.0087623 0.0932473 0 - 1
2017 reintro F 201 0.0497512 0.2179735 0 - 1
2017 reintro M 304 0.0328947 0.1962609 0 - 2

Results Text

The results in text combine estimates of TLF across sexes. Let’s generate the table above again, without splitting moms and dads.

kable(parents %>%
  group_by(year, type) %>%
  summarise(N = n(), mean_tlf = mean(tlf), sd_tlf = sd(tlf), range = paste(min(tlf), " -  ", max(tlf))), align = "c", caption = "Total Lifetime Fitness by Parent Group") %>%
  kable_classic(full_width = F, html_font = "Cambria") %>%
  kable_styling(fixed_thead = T) %>%
  collapse_rows(columns = 1, valign = "top")
Total Lifetime Fitness by Parent Group
year type N mean_tlf sd_tlf range
2011 carcass 197 0.2588832 0.5793952 0 - 3
2011 outplant 149 1.0000000 2.1117432 0 - 15
2012 carcass 84 0.2976190 0.5967737 0 - 2
2012 outplant 258 1.2093023 2.2375446 0 - 17
2013 carcass 39 0.0769231 0.2699528 0 - 1
2013 outplant 1125 0.3884444 0.9023742 0 - 7
2013 reintro 554 0.2509025 0.6422786 0 - 5
2014 carcass 46 0.0434783 0.2061846 0 - 1
2014 outplant 861 0.1753775 0.5989506 0 - 8
2014 reintro 754 0.0954907 0.3440341 0 - 2
2015 carcass 19 0.1578947 0.3746343 0 - 1
2015 carcass_above 15 0.4000000 0.9102590 0 - 3
2015 outplant 1042 0.6900192 1.4104305 0 - 10
2015 reintro 148 0.8378378 1.6743526 0 - 10
2015 reintro_above 431 1.2111369 2.6333225 0 - 26
2016 carcass 19 0.2631579 0.4524139 0 - 1
2016 outplant 1310 1.6877863 2.7511099 0 - 30
2016 reintro 520 0.3653846 0.8675003 0 - 9
2017 carcass 14 0.0000000 0.0000000 0 - 0
2017 outplant 1604 0.0099751 0.0994068 0 - 1
2017 reintro 505 0.0396040 0.2051322 0 - 2

Notes

Math Check

The math here can get a little funky because we combine duo and trio assignments. Let’s manually calculate the values here from the pedigree and check that they match the output of the notebook code above.

Among the 149 outplants in 2011, 72 were female and 77 were male. There were 94 total offspring (18, 33, and 43 in 2014, 2015 and 2016 respectively). Of the 94 total offspring 84 were assigned to a mother and 67 were assigned to a father for fitness female = 1.21 and fitness male = 0.87.

For overall mean fitness we cannot simply divide the number of offspring by number of parents like above because some offspring are assigned to two parents while others are assigned to two. Instead we sum the TLF of each parent and divide by the number of parents. In 2011 the sum of TLF across individual parents is 149 for overall mean fitness of 1. Instead of using the code above here, I combined mothers and fathers from the pedigree and summed 2011 outplants. Calculating fitness this way, if all assignments were trios we need a fitness of 2 to reach an overall CRR of 1.

All looks good!

2015 together

We also present 2015 outplants and reintros together as assignments suggest they freely spawn together and the report is interested in productivity above the damn generally, not just the productivity of outplants OR reintros.

parents %>%
  filter(year == 2015, type %in% c("reintro_above", "outplant")) %>%
  summarise(mean_tlf = mean(tlf), sd_tlf = sd(tlf), count= n())
parents %>%
  filter(year == 2015, type %in% c("reintro_above", "outplant"), tlf == 0) %>%
  summarise(zero_tlf_count= n())
parents %>%
  filter(year == 2015, type %in% c("reintro_above", "outplant"), geno_sex == "M") %>%
  summarise(mean_tlf = mean(tlf), sd_tlf = sd(tlf), count= n())
parents %>%
  filter(year == 2015, type %in% c("reintro_above", "outplant"), geno_sex == "F") %>%
  summarise(mean_tlf = mean(tlf), sd_tlf = sd(tlf), count= n())

2015 comparisons

We should probably build a more complex model here, but for now let’s do a one-liner analysis to see if 2015 reintros above did significantly better than reintros below or outplants above.

abv_data <- filter(parents, year == 2015, type %in% c("outplant", "reintro_above", "reintro")) %>%
  mutate(type = fct_relevel(type, c("reintro_above", "reintro", "outplant")))

above_glm <- glm.nb(tlf ~  geno_sex + type, data = abv_data )
summary(above_glm)
## 
## Call:
## glm.nb(formula = tlf ~ geno_sex + type, data = abv_data, init.theta = 0.3030093177, 
##     link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.0386  -0.8843  -0.8042   0.1109   3.2339  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    0.4013     0.1205   3.330  0.00087 ***
## geno_sexM     -0.3230     0.1077  -2.999  0.00271 ** 
## typereintro   -0.4140     0.2008  -2.062  0.03919 *  
## typeoutplant  -0.6267     0.1206  -5.198 2.02e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(0.303) family taken to be 1)
## 
##     Null deviance: 1204.9  on 1620  degrees of freedom
## Residual deviance: 1172.7  on 1617  degrees of freedom
## AIC: 3827.2
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.3030 
##           Std. Err.:  0.0220 
## 
##  2 x log-likelihood:  -3817.1820
rootogram(above_glm)

emmeans( above_glm, "type", type = "response")
##  type          response    SE  df asymp.LCL asymp.UCL
##  reintro_above    1.271 0.126 Inf     1.046     1.544
##  reintro          0.840 0.147 Inf     0.597     1.183
##  outplant         0.679 0.046 Inf     0.595     0.776
## 
## Results are averaged over the levels of: geno_sex 
## Confidence level used: 0.95 
## Intervals are back-transformed from the log scale
plot(emmeans( above_glm, "type", type = "response"))

#TukeyHSD(aov(data = filter(parents, year == 2015, type %in% c("outplant", "reintro_above", "reintro")), tlf ~ type))

ggplot(data = as.data.frame(emmeans( above_glm, "type", type = "response")), aes(x = type, y = response, ymin = response-SE, ymax = response+SE)) + geom_pointrange()+theme_classic()+scale_x_discrete(labels = c("Reintroduced Above\nDetroit", "Reintroduced Below\nBig Cliff", "Outplanted Above\nDetroit"), name = "Type")+ylab("Estimated TLF")

Type is a significant predictor of fitness.

NOR salmon had significantly higher fitness HOR salmon in the above Detroit habitat, but not compared to NOR salmon reintroduced to below Big Cliff. Note this result should NOT be trusted, the factors are mispecified, rather than compare the three groups, the effects of origin and habitat should treated as separate factors. We should also consider the effects of other covariates and/or random effects.

Cohort Replacement Rates

Here we estimate the cohort replacement rate across different groups of parents.

Results are prepared for each of the following groups:

  • Outplants (Above Detroit)
  • Reintroductions (Below Big Cliff)
  • Reintroductions (Above Detroit)
  • All abobve Detroit

Within each of the above, calculate for each parent year and each sex.

# data wrangling here is very similar to TLF, maybe we can save some work and use it again

# add offspring sex to pedigree_meta
full_meta <- readxl::read_xlsx("../parentage/parentage_data/full_dataset.xlsx", sheet = 1, col_types = "text")

pedigree_meta %<>%
  left_join(select(full_meta, geno_sex, sample_id), by = c("offspring_sample_id" = "sample_id")) %>%
  mutate(offspring_sex = geno_sex)

father_male_offspring_counts <- pedigree_meta %>%
  filter(offspring_sex == "M") %>%
  group_by(father) %>%
  count() %>%
  rename(parent = father)

mother_female_offspring_counts <- pedigree_meta %>%
  filter(offspring_sex == "F") %>%
  group_by(mother) %>%
  count() %>%
  rename(parent = mother)

parent_counts_same_sex <- bind_rows(mother_female_offspring_counts, father_male_offspring_counts) 
rm(father_male_offspring_counts)
rm(mother_female_offspring_counts)

parents %<>%
  left_join(parent_counts_same_sex, by = c("sample_id" = "parent")) %>%
  rename(same_sex_offspring = n) %>%
  mutate(same_sex_offspring = replace_na(same_sex_offspring, 0))

kable(parents %>%
  group_by(year, type, geno_sex) %>%
  summarise(crr = sum(same_sex_offspring)/n()), align = "c", digits = 2) %>%
  kable_classic(full_width = F, html_font = "Cambria") %>%
  kable_styling(fixed_thead = T) %>%
  scroll_box(height = "400px")
year type geno_sex crr
2011 carcass F 0.07
2011 carcass M 0.19
2011 outplant F 0.54
2011 outplant M 0.49
2012 carcass F 0.04
2012 carcass M 0.15
2012 outplant F 0.43
2012 outplant M 0.77
2013 carcass F 0.00
2013 carcass M 0.00
2013 outplant F 0.18
2013 outplant M 0.20
2013 reintro F 0.13
2013 reintro M 0.12
2014 carcass F 0.00
2014 carcass M 0.00
2014 outplant F 0.13
2014 outplant M 0.07
2014 reintro F 0.04
2014 reintro M 0.05
2015 carcass F 0.00
2015 carcass M 0.09
2015 carcass_above F 0.17
2015 carcass_above M 0.22
2015 outplant F 0.33
2015 outplant M 0.33
2015 reintro F 0.41
2015 reintro M 0.44
2015 reintro_above F 0.55
2015 reintro_above M 0.66
2016 carcass F 0.20
2016 carcass M 0.11
2016 outplant F 0.45
2016 outplant M 1.60
2016 reintro F 0.24
2016 reintro M 0.14
2017 carcass F 0.00
2017 carcass M 0.00
2017 outplant F 0.00
2017 outplant M 0.01
2017 reintro F 0.00
2017 reintro M 0.03
# Let's check one of these to make sure the code is working correctly. In 2016, there were 452 male outplants. These 452 potential fathers appear in the final pedigree 1092 times. Of these 1092 offspring, 723 is male. Therefore the correct CRR for 2016 male outplants is 1.5995575. This matches the table.

The results test also presents the numerator and denominator of CRR not split by sex, but doesn’t call it CRR or do the calculation (just presents the number of outplant parents and the number of offspring assigned to them - for good reason as this could paint an overly rosey picture with uneven sex ratios). Let’s sum these below, so it is easier to write this result section.

a <- pedigree_meta %>%
  mutate(parent_year = coalesce(mother_year, father_year)) %>%
  filter(father_type == "outplant" | mother_type == "outplant") %>% 
  group_by(parent_year) %>%
  summarise(offspring_n = n())

b <- full_meta %>%
  filter(type == "outplant") %>%
  count(year) %>%
  rename(parent_year = year, n_outplants = n)

kable(left_join(a,b), align = "c", caption = "total number of offspring that assign to outplants per parent year") %>% kable_classic(full_width = F, html_font = "Cambria") %>%
  kable_styling(fixed_thead = T) 
total number of offspring that assign to outplants per parent year
parent_year offspring_n n_outplants
2011 94 149
2012 174 258
2013 242 1125
2014 87 861
2015 498 1042
2016 1174 1310
2017 9 1604
rm(a)
rm(b)

The results text also splits the results above according to carcass and non-carcass.

a <- pedigree_meta %>%
  mutate(parent_year = coalesce(mother_year, father_year)) %>%
  filter(father_type == "outplant" | mother_type == "outplant") %>% 
  group_by(parent_year, offspring_type) %>%
  summarise(offspring_n = n())

b <- full_meta %>%
  filter(type == "outplant") %>%
  count(year) %>%
  rename(parent_year = year, n_outplants = n)

kable(left_join(a,b), align = "c", caption = "total number of offspring that assign to outplants per parent year") %>% kable_classic(full_width = F, html_font = "Cambria") %>%
  kable_styling(fixed_thead = T) 
total number of offspring that assign to outplants per parent year
parent_year offspring_type offspring_n n_outplants
2011 carcass 3 149
2011 reintro 69 149
2011 reintro_above 22 149
2012 carcass 2 258
2012 reintro 167 258
2012 reintro_above 5 258
2013 carcass 3 1125
2013 reintro 239 1125
2014 carcass 1 861
2014 reintro 86 861
2015 carcass 2 1042
2015 reintro 496 1042
2016 reintro 1174 1310
2017 reintro 9 1604
rm(a)
rm(b)

There are also CRR results presented for Reintros in the text, so we need to combined across sex values here too.

a <- pedigree_meta %>%
  mutate(parent_year = coalesce(mother_year, father_year)) %>%
  filter(father_type == "reintro" | mother_type == "reintro") %>% 
  group_by(parent_year) %>%
  summarise(offspring_n = n())

b <- full_meta %>%
  filter(type == "reintro") %>%
  count(year) %>%
  rename(parent_year = year, n_reintros = n)

kable(left_join(a,b), align = "c", caption = "total number of offspring that assign to reintros per parent year") %>% kable_classic(full_width = F, html_font = "Cambria") %>%
  kable_styling(fixed_thead = T) 
total number of offspring that assign to reintros per parent year
parent_year offspring_n n_reintros
2013 91 554
2014 50 754
2015 83 NA
2016 118 520
2017 13 505
rm(a)
rm(b)

Also need the split into carcass and live:

a <- pedigree_meta %>%
  mutate(parent_year = coalesce(mother_year, father_year)) %>%
  filter(father_type == "reintro" | mother_type == "reintro") %>% 
  group_by(parent_year, offspring_type) %>%
  summarise(offspring_n = n())

b <- full_meta %>%
  filter(type == "reintro") %>%
  count(year) %>%
  rename(parent_year = year, n_reintros = n)

kable(left_join(a,b), align = "c", caption = "total number of offspring that assign to reintros per parent year") %>% kable_classic(full_width = F, html_font = "Cambria") %>%
  kable_styling(fixed_thead = T) 
total number of offspring that assign to reintros per parent year
parent_year offspring_type offspring_n n_reintros
2013 carcass 2 554
2013 reintro 89 554
2014 carcass 1 754
2014 reintro 49 754
2015 carcass 1 NA
2015 reintro 82 NA
2016 reintro 118 520
2017 reintro 13 505
rm(a)
rm(b)

And finally, the 2015 above Detroit reintros (in a future revision of this notebook, consider combining all of these into a single table)

a <- pedigree_meta %>%
  mutate(parent_year = coalesce(mother_year, father_year)) %>%
  filter(father_type == "reintro_above" | mother_type == "reintro_above") %>% 
  group_by(parent_year) %>%
  summarise(offspring_n = n())

b <- full_meta %>%
  filter(type == "reintro_above") %>%
  count(year) %>%
  rename(parent_year = year, n_reintros_above = n)

kable(left_join(a,b), align = "c", caption = "total number of offspring that assign to reintros above Detroit per parent year") %>% kable_classic(full_width = F, html_font = "Cambria") %>%
  kable_styling(fixed_thead = T) 
total number of offspring that assign to reintros above Detroit per parent year
parent_year offspring_n n_reintros_above
2015 396 431
rm(a)
rm(b)

Also need the split into carcass and live:

a <- pedigree_meta %>%
  mutate(parent_year = coalesce(mother_year, father_year)) %>%
  filter(father_type == "reintro_above" | mother_type == "reintro_above") %>% 
  group_by(parent_year, offspring_type) %>%
  summarise(offspring_n = n())

b <- full_meta %>%
  filter(type == "reintro_above") %>%
  count(year) %>%
  rename(parent_year = year, n_reintros = n)

kable(left_join(a,b), align = "c", caption = "total number of offspring that assign to reintros above Detroit per parent year") %>% kable_classic(full_width = F, html_font = "Cambria") %>%
  kable_styling(fixed_thead = T) 
total number of offspring that assign to reintros above Detroit per parent year
parent_year offspring_type offspring_n n_reintros
2015 reintro 396 431
rm(a)
rm(b)

2015 together

kable(parents %>%
        filter(year == 2015, type %in% c("reintro_above", "outplant")) %>%
  group_by(geno_sex) %>%
  summarise(crr = sum(same_sex_offspring)/n()), align = "c", digits = 2, caption = "crr by sex for all salmon outplanted OR reintroduced above detroit dam") %>%
  kable_classic(full_width = F, html_font = "Cambria") %>%
  kable_styling(fixed_thead = T) %>%
  scroll_box(height = "400px")
crr by sex for all salmon outplanted OR reintroduced above detroit dam
geno_sex crr
F 0.38
M 0.44
a <- pedigree_meta %>%
  mutate(parent_year = coalesce(mother_year, father_year)) %>%
  filter(father_type %in% c("reintro_above", "outplant") | mother_type %in% c("reintro_above", "outplant") ) %>% 
  group_by(parent_year) %>%
  summarise(offspring_n = n())

b <- full_meta %>%
  filter(type %in% c("reintro_above", "outplant") ) %>%
  count(year) %>%
  rename(parent_year = year, n__above = n)

kable(left_join(a,b), align = "c", caption = "total number of offspring that assign to reintros or outplants above Detroit per parent year") %>% kable_classic(full_width = F, html_font = "Cambria") %>%
  kable_styling(fixed_thead = T) 
total number of offspring that assign to reintros or outplants above Detroit per parent year
parent_year offspring_n n__above
2011 94 149
2012 174 258
2013 242 1125
2014 87 861
2015 715 1473
2016 1174 1310
2017 9 1604
rm(a)
rm(b)

Predictors of Fitness

Here we fit some GLMs on fitness of outplanted salmon. We split the analysis below into two full model specifications: the specification from the draft report and a different model that might have more power to detect significant effects of variables on fitness.

Fixed Predictors (GLM)

First we fit the model the same way as the draft report. For each outplant year, Total lifetime fitness ~ sex + release location + release date. All are fixed effects and all but date are factors (categorical). The GLM family is Poisson (log link function, Poisson distribution) and the GLM is evaluated using likelihood ratio test (chi-squared distribution) and individual levels of effects are evaluated with a Wald Tests (automatically done by R so included below) and likelihood-ratio tests (to match what was done previously).

I also went a little further than what was done before:
I did model validation by (a) comparing the scaled residuals against rank transformed predicted values and (b) examining a QQ plot for goodness of fit (Kolmogorov–Smirnov test), overdispersion, and outliers. This was added to the report draft. The previous analysis conducted model validation using a Chi-squared goodness of fit test using Pearson residuals. The models failed the test (and therefore model validation) in every year, but this was not reported in the draft.

I also went a little further here, but did not include the follwoing in the report draft. For years with significant full models and significant effects, we conducted stepwise model selection by AIC to find a sparse final model, assessed significance of predictor variables in the final model using a likelihood-ratio test, and assessed significance of individual levels of predictors using a Wald test.

Some other notes:
Note 1:
The previous workers on this project (Evans, Black, Bohn) all used jmp to fit this model. Specifically, in the previous North Santiam (and South Santiam) reports jmp was used to fit a glm with a poisson distribution. However, the data is a poor fit to the distribution: in each year the response variable demonstrates significant overdispersion. Sard solved this problem by fitting their models using a negative binomial in McKenzie, but previous workers on Santiam adjusted the tests using an “overdispersion parameter estimated by Pearson Chisq/DF.” This approach is implemented in the R glm() function by using the “quasipoisson” rather than poisson family for the glm fit, but the authors of the most populat r packages for fitting GLMs believe it is inappropriate to output AIC or likelihood using this “quasipoisson” distribution, while the jmp authors don’t seem to have an issue with reporting AIC/likelihoods when the response variable demonstrates overdispersion and report it. Table 5 reports to log-likelihood ratio of individual effects so to get the table with the exact same results, so we need to make a decision here.

I’d prefer to take the approach Sard took and fit with a different distribution (negative binomial). This seems to be the best compromise between getting this done in a timely manner, and limiting the changes to the previous approach. I take this approach below.

Alternatively we could (a) take the time to figure out how to run jmp, (b) use the quasipoisson (approach from before), but eliminate the log-likelihood ratio from table 5 and only report the p values.

Note 2:
In my original interpretation of the methods section and logs I thought the previous model fits in JMP fit release date as a fixed factor. When I found the actual data provided to JMP I found that Bohn had converted release date to Julian day (continuous numerical predictor). This is a huge relief for interpreting the results, but still isn’t optimal, because it doesn’t take into account the contribution to variance of batch effects of release dates, only considers a linear relationship between julian day and log(TLF). A mixed model can do both (random batch effects + fixed effect of Julian day).
Also, we are ignoring non-linear relationships. What if Julian day is strongly predictive of fitness but not linear - e.g. stabilizing selection with greatest fitness observed in the middle of the distribution but poor fitness at extremes. We don’t explore this possibility in the current approach.

Note 3:
The draft report methods state that interaction terms are fit in the model, but the results table shows model fits without interaction terms. Was there some stage of analysis where the model fits were analyzed with and without interaction terms (i.e. model selection)? Was the data examined for a potential interaction between release date and sex? How was this decision made? It seems like if the likelihood ratio test for interaction term was not significant it was dropped from the model. Should I take the same approach if we decide to leave this part of the report exactly as is? I did not fit the interaction term to follow what was done in the final version of the results, but when we discuss changing this part of the report we should discuss this.

Approach Outline
For each of the years below, I first present the structure of the data to determine if the full model can be fit. Then I fit the full model and test it using a log-likelihood ratio test against a null model that contains only an intercept. If there is a signifcant global/full model and evidence that predictors have significant effect (Wald test + likelihood ratio test), then I proceed to model validation and model selection and present estimates of effects of predictors of fitness.

2011

# data
glm_2011_data <- parents %>%
  filter(type == "outplant", year == "2011") %>%
  select(date, geno_sex, location, tlf) %>%
  drop_na() %>%
  mutate(jday = as.numeric(format(date, "%j"))) %>%
  mutate(date = as.factor(date), geno_sex = as.factor(geno_sex), location = as.factor(location))

str(glm_2011_data)
## tibble [148 × 5] (S3: tbl_df/tbl/data.frame)
##  $ date    : Factor w/ 3 levels "2011-07-21","2011-07-28",..: 1 1 1 1 1 1 1 2 2 2 ...
##  $ geno_sex: Factor w/ 2 levels "F","M": 1 1 1 1 1 1 1 1 1 1 ...
##  $ location: Factor w/ 1 level "mongold": 1 1 1 1 1 1 1 1 1 1 ...
##  $ tlf     : num [1:148] 0 0 0 0 0 1 0 0 1 0 ...
##  $ jday    : num [1:148] 202 202 202 202 202 202 202 209 209 209 ...

For 2011, there is only a single release location and 3 dates. We will not include location in the model.

Let’s fit the model and test for global significance.

#glm_2011 <- glm.nb(tlf ~ geno_sex + date +geno_sex*date , data = glm_2011_data)
glm_2011_full <- glm.nb(tlf ~ geno_sex + jday  , data = glm_2011_data)
glm_2011_null <- glm.nb(tlf ~ 1 , data = glm_2011_data)

# test for global
anova(glm_2011_null, glm_2011_full, test = "Chisq")
summary(glm_2011_full)
## 
## Call:
## glm.nb(formula = tlf ~ geno_sex + jday, data = glm_2011_data, 
##     init.theta = 0.2641415947, link = log)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -0.94262  -0.93766  -0.88036   0.05845   2.45947  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.226326   5.750039  -0.039    0.969
## geno_sexM   -0.249427   0.361634  -0.690    0.490
## jday         0.001672   0.027127   0.062    0.951
## 
## (Dispersion parameter for Negative Binomial(0.2641) family taken to be 1)
## 
##     Null deviance: 108.46  on 147  degrees of freedom
## Residual deviance: 107.95  on 145  degrees of freedom
## AIC: 385.23
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.2641 
##           Std. Err.:  0.0579 
## 
##  2 x log-likelihood:  -377.2310

Let’s conduct LR tests on each predictor.

dropterm(glm_2011_full, test = "Chisq")

The global model is insignificant and Wald and LR Tests of each predictor are not significant. Do not proceed with model selection.

2012

# data
glm_2012_data <- parents %>%
  filter(type == "outplant", year == "2012") %>%
  select(date, geno_sex, location, tlf) %>%
  drop_na() %>%
  mutate(jday = as.numeric(format(date, "%j"))) %>%
  mutate(date = as.factor(date), geno_sex = as.factor(geno_sex), location = as.factor(location))

str(glm_2012_data)
## tibble [258 × 5] (S3: tbl_df/tbl/data.frame)
##  $ date    : Factor w/ 10 levels "2012-06-01","2012-06-14",..: 1 1 1 1 1 1 1 1 1 2 ...
##  $ geno_sex: Factor w/ 2 levels "F","M": 2 2 2 2 2 2 2 2 2 2 ...
##  $ location: Factor w/ 2 levels "breitenbush river",..: 2 2 2 2 2 2 2 2 2 1 ...
##  $ tlf     : num [1:258] 0 0 0 0 8 0 0 0 1 3 ...
##  $ jday    : num [1:258] 153 153 153 153 153 153 153 153 153 166 ...

For 2012, we will fit the full model

Let’s fit the model and test for global significance.

#glm_2011 <- glm.nb(tlf ~ geno_sex + date +geno_sex*date , data = glm_2011_data)
glm_2012_full <- glm.nb(tlf ~ geno_sex + jday + location  , data = glm_2012_data)
glm_2012_null <- glm.nb(tlf ~ 1 , data = glm_2012_data)

# test for global
anova(glm_2012_null, glm_2012_full, test = "Chisq")
summary(glm_2012_full)
## 
## Call:
## glm.nb(formula = tlf ~ geno_sex + jday + location, data = glm_2012_data, 
##     init.theta = 0.3395078158, link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.1258  -1.0000  -0.9525   0.1211   2.3462  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)
## (Intercept)     1.703592   1.187964   1.434    0.152
## geno_sexM       0.181026   0.244296   0.741    0.459
## jday           -0.008681   0.006948  -1.249    0.212
## locationhoover  0.061619   0.374671   0.164    0.869
## 
## (Dispersion parameter for Negative Binomial(0.3395) family taken to be 1)
## 
##     Null deviance: 213.28  on 257  degrees of freedom
## Residual deviance: 211.17  on 254  degrees of freedom
## AIC: 748.92
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.3395 
##           Std. Err.:  0.0539 
## 
##  2 x log-likelihood:  -738.9180

Let’s conduct LR tests on each predictor.

dropterm(glm_2012_full, test = "Chisq")

The global model is insignificant and Wald Tests of each predictor are not significant. Do not proceed with model selection

2013

# data
glm_2013_data <- parents %>%
  filter(type == "outplant", year == "2013") %>%
  select(date, geno_sex, location, tlf) %>%
  drop_na() %>%
  mutate(jday = as.numeric(format(date, "%j"))) %>%
  mutate(date = as.factor(date), geno_sex = as.factor(geno_sex), location = as.factor(location))

str(glm_2013_data)
## tibble [1,115 × 5] (S3: tbl_df/tbl/data.frame)
##  $ date    : Factor w/ 23 levels "2013-06-10","2013-06-13",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ geno_sex: Factor w/ 2 levels "F","M": 1 1 1 1 1 1 1 1 1 1 ...
##  $ location: Factor w/ 3 levels "hoover","kanes",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ tlf     : num [1:1115] 1 0 1 0 0 0 0 0 0 0 ...
##  $ jday    : num [1:1115] 161 161 161 161 161 161 161 161 161 161 ...

For 2013, we will fit the full model

Let’s fit the model and test for global significance.

#glm_2011 <- glm.nb(tlf ~ geno_sex + date +geno_sex*date , data = glm_2011_data)
glm_2013_full <- glm.nb(tlf ~ geno_sex + jday + location , data = glm_2013_data)
glm_2013_null <- glm.nb(tlf ~ 1 , data = glm_2013_data)

# test for global
anova(glm_2013_null, glm_2013_full, test = "Chisq")
summary(glm_2013_full)
## 
## Call:
## glm.nb(formula = tlf ~ geno_sex + jday + location, data = glm_2013_data, 
##     init.theta = 0.309686153, link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.7650  -0.7081  -0.6910  -0.6456   2.7795  
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)  
## (Intercept)     -0.6994831  0.8144872  -0.859   0.3904  
## geno_sexM       -0.2466895  0.1465946  -1.683   0.0924 .
## jday            -0.0003861  0.0040253  -0.096   0.9236  
## locationkanes   -0.1859461  0.1762224  -1.055   0.2913  
## locationmongold  0.0778672  0.3148751   0.247   0.8047  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(0.3097) family taken to be 1)
## 
##     Null deviance: 660.41  on 1114  degrees of freedom
## Residual deviance: 655.68  on 1110  degrees of freedom
## AIC: 1777.1
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.3097 
##           Std. Err.:  0.0396 
## 
##  2 x log-likelihood:  -1765.0660

Let’s conduct LR tests on each predictor.

dropterm(glm_2013_full, test = "Chisq")

The global model is insignificant and Wald and LR Tests of each predictor are not significant. Do not proceed with model selection.

2014

# data
glm_2014_data <- parents %>%
  filter(type == "outplant", year == "2014") %>%
  select(date, geno_sex, location, tlf) %>%
  drop_na() %>%
  mutate(jday = as.numeric(format(date, "%j"))) %>%
  mutate(date = as.factor(date), geno_sex = as.factor(geno_sex), location = as.factor(location))

str(glm_2014_data)
## tibble [861 × 5] (S3: tbl_df/tbl/data.frame)
##  $ date    : Factor w/ 36 levels "2014-06-17","2014-06-19",..: 2 2 1 1 1 1 1 1 1 1 ...
##  $ geno_sex: Factor w/ 2 levels "F","M": 1 1 1 1 1 1 1 1 1 1 ...
##  $ location: Factor w/ 5 levels "breitenbush river",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ tlf     : num [1:861] 0 0 0 0 0 0 0 0 0 0 ...
##  $ jday    : num [1:861] 170 170 168 168 168 168 168 168 168 168 ...

For 2014, we will fit the full model

Let’s fit the model and test for global significance.

#glm_2011 <- glm.nb(tlf ~ geno_sex + date +geno_sex*date , data = glm_2011_data)
glm_2014_full <- glm.nb(tlf ~ geno_sex + jday + location   , data = glm_2014_data)
glm_2014_null <- glm.nb(tlf ~ 1 , data = glm_2014_data)

# test for global
anova(glm_2014_null, glm_2014_full, test = "Chisq")
summary(glm_2014_full)
## 
## Call:
## glm.nb(formula = tlf ~ geno_sex + jday + location, data = glm_2014_data, 
##     init.theta = 0.2259669686, link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.6018  -0.5557  -0.4709  -0.4523   2.9572  
## 
## Coefficients:
##                        Estimate Std. Error z value Pr(>|z|)  
## (Intercept)           -1.095675   2.329556  -0.470    0.638  
## geno_sexM             -0.549773   0.223700  -2.458    0.014 *
## jday                  -0.001209   0.009360  -0.129    0.897  
## locationcoopers ridge -0.100648   0.854165  -0.118    0.906  
## locationhoover        -0.170866   0.564887  -0.302    0.762  
## locationhorn creek     0.079127   0.465986   0.170    0.865  
## locationkanes         -0.062769   0.533722  -0.118    0.906  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(0.226) family taken to be 1)
## 
##     Null deviance: 341.36  on 860  degrees of freedom
## Residual deviance: 334.93  on 854  degrees of freedom
## AIC: 827.39
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.2260 
##           Std. Err.:  0.0486 
## 
##  2 x log-likelihood:  -811.3900

Let’s conduct LR tests on each predictor.

dropterm(glm_2014_full, test = "Chisq")

The global model is insignificant and but sex has a significant effect (according to Wald Test and LRT). Do not proceed with model selection as this could lead to overfitting (global model not significant). Let’s do model validation though.

simulationOutput <- simulateResiduals(fittedModel = glm_2014_full, plot = F)
plot(simulationOutput)
## DHARMa:testOutliers with type = binomial may have inflated Type I error rates for integer-valued distributions. To get a more exact result, it is recommended to re-run testOutliers with type = 'bootstrap'. See ?testOutliers for details

Model is a poor fit to the data, likely due to outliers. Because we are dealing with count data, we might need a different approach to making sure outliers are the true source of problem and not model mispecification, but in any case there is dependence between the predicted values and the residuals and the qqplot reveals some issues. The model fit in JMP also failed validation, but the result was not reported.

Let’s convert the paramter estimate to something more easy to interpret (invert the tranformation back to fitness)

exp(coef(glm_2014_full))
##           (Intercept)             geno_sexM                  jday 
##             0.3343140             0.5770810             0.9987913 
## locationcoopers ridge        locationhoover    locationhorn creek 
##             0.9042514             0.8429348             1.0823421 
##         locationkanes 
##             0.9391607

Males have predicted 57% fitness of females in 2014.

My take is that this absolutely should not be included in the draft as a significant effect of sex. We are violating model assumptions. If we are committed to the modeling approach as is, we should explore why model validation failed and evaluate how severe of a problem we are dealing with. Maybe we’re comfortable with this level of goodness of fit? Alternatively, maybe the model needs to be specified differently or we need to remove outliers.

2015

# data
glm_2015_data <- parents %>%
  filter(type %in% c("outplant", "reintro_above"), year == "2015") %>%
  select(date, geno_sex, location, tlf) %>%
  drop_na() %>%
  mutate(jday = as.numeric(format(date, "%j"))) %>%
  mutate(date = as.factor(date), geno_sex = as.factor(geno_sex), location = as.factor(location))

str(glm_2015_data)
## tibble [1,473 × 5] (S3: tbl_df/tbl/data.frame)
##  $ date    : Factor w/ 22 levels "2015-06-12","2015-06-16",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ geno_sex: Factor w/ 2 levels "F","M": 1 2 1 1 1 1 1 1 1 1 ...
##  $ location: Factor w/ 4 levels "breitenbush river",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ tlf     : num [1:1473] 0 2 0 0 0 0 0 0 0 0 ...
##  $ jday    : num [1:1473] 167 167 167 167 167 167 167 167 167 167 ...

For 2015, we will fit the full model

Let’s fit the model and test for global significance.

#glm_2011 <- glm.nb(tlf ~ geno_sex + date +geno_sex*date , data = glm_2011_data)
glm_2015_full <- glm.nb(tlf ~ geno_sex + jday + location  , data = glm_2015_data)
glm_2015_null <- glm.nb(tlf ~ 1 , data = glm_2015_data)

# test for global
anova(glm_2015_null, glm_2015_full, test = "Chisq")
summary(glm_2015_full)
## 
## Call:
## glm.nb(formula = tlf ~ geno_sex + jday + location, data = glm_2015_data, 
##     init.theta = 0.3133874085, link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.0779  -0.8828  -0.8203   0.1317   3.0791  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       -0.209470   0.569291  -0.368  0.71291    
## geno_sexM         -0.306089   0.112830  -2.713  0.00667 ** 
## jday              -0.001374   0.002957  -0.465  0.64221    
## locationdry cr     0.990797   0.188104   5.267 1.38e-07 ***
## locationdry creek  0.595059   0.236652   2.514  0.01192 *  
## locationhorn cr    0.471364   0.245730   1.918  0.05508 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(0.3134) family taken to be 1)
## 
##     Null deviance: 1117.8  on 1472  degrees of freedom
## Residual deviance: 1067.5  on 1467  degrees of freedom
## AIC: 3459.1
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.3134 
##           Std. Err.:  0.0241 
## 
##  2 x log-likelihood:  -3445.0810

Let’s conduct LR tests on each predictor.

dropterm(glm_2015_full, test = "Chisq")

Global model is significant. Sex has signifcant effect by Wald and LRT. Let’s do model validation.

simulationOutput <- simulateResiduals(fittedModel = glm_2015_full, plot = F)
plot(simulationOutput)

This model also has some issues, though less severe. The magnitude of residuals is not independent of the fitted values. For fun, let’s see if one variable is causing the problem.

plotResiduals(simulationOutput, form = glm_2015_data$geno_sex)

plotResiduals(simulationOutput, form = glm_2015_data$location)

plotResiduals(simulationOutput, form = glm_2015_data$jday)

Nope, all looks good. Let’s save further diagnosis until after we get the first draft (exactly as done before) completed.

We will do model selection though, since it is quick.

msl_2015 <- stepAIC(glm_2015_full, direction = "backward")
msl_2015$anova
anova(msl_2015)
## Warning in anova.negbin(msl_2015): tests made without re-estimating 'theta'
simulationOutput <- simulateResiduals(fittedModel = msl_2015, plot = F)
plot(simulationOutput)

The best final model using stepwise model selection on AIC includes sex and location. It appear getting rid of Julian day solved the model fit issues.Let’s take a look at the coefficients from this model.

summary(msl_2015)
## 
## Call:
## glm.nb(formula = tlf ~ geno_sex + location, data = glm_2015_data, 
##     init.theta = 0.3132234248, link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.0633  -0.8800  -0.8289   0.1226   3.1410  
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        -0.4705     0.1050  -4.480 7.46e-06 ***
## geno_sexM          -0.3027     0.1128  -2.683 0.007290 ** 
## locationdry cr      0.9345     0.1418   6.593 4.32e-11 ***
## locationdry creek   0.5055     0.1358   3.721 0.000198 ***
## locationhorn cr     0.4644     0.2457   1.890 0.058710 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(0.3132) family taken to be 1)
## 
##     Null deviance: 1117.4  on 1472  degrees of freedom
## Residual deviance: 1067.4  on 1468  degrees of freedom
## AIC: 3457.3
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.3132 
##           Std. Err.:  0.0241 
## 
##  2 x log-likelihood:  -3445.2970

I’ll convert the coefficients into something easier to understand by taking the inverse of the log transformation of the effects (e.g. exp(parameter estimate)).

exp(glm_2015_full$coefficients)
##       (Intercept)         geno_sexM              jday    locationdry cr 
##         0.8110140         0.7363208         0.9986273         2.6933808 
## locationdry creek   locationhorn cr 
##         1.8131376         1.6021773

Males have predicted 72% of females in 2015 and Dry Creek is better than Breitenbush

GLMs (Dayan)

I think the most powerful approach may be to use a mixed model, however (1) GLMMs on overdispersed count data are getting into some very recently developed statistics and might be challenging and (2) there are still improvements we can make to the GLMs without incorporating random effects.

In the previous GLM section I attempted to follow what had been done previously. In this section I give myself the same freedom it seems other workers have had when trying to model fitness and explore additional covariates and distributions, I also update the model selection and validation approach, given recent developments in best practices for GLMs.

Outline

  1. EDA - let’s look at the data in greater detail before jumping into modeling.
  2. Choose covariates - do some preliminary model fits and see if we’re missing anything. For example in the mixed model approach where we combine year, we may see that the sex effect is actually due to skewed sex ratios, therefore we can assume that an important interaction term between sex and sex ratio of outplanting that year is missing from the model, leading to poor fit and overdispersion. Can we think of what might be missing from the model?
  3. Choose Distribution - fit full models of Poisson, Quasi-Poisson, NB, ZIP, ZINB and Hurdle models. Does this fix the overdispersion? Make the residuals look good?
  4. Model selection and validation on the final model.

We’ll do our full exploration using 2014 data.

EDA

First let’s look closely at the distribution of the response variable

ggplot(data = glm_2014_data)+geom_histogram(aes(x = tlf))+theme_classic()

Hard to imagine something more zero inflated than that. No wonder the Poisson, Quasi-Poisson and NB models had a hard time.

Next let’s look at some biplots.

##################################################################
##################################################################
#Here are some functions that we took from the pairs help file and
#modified, or wrote ourselves. To cite these, use the r citation: citation()

panel.cor <- function(x, y, digits=1, prefix="", cex.cor = 6)
{
  usr <- par("usr"); on.exit(par(usr))
  par(usr = c(0, 1, 0, 1))
  r1=cor(x,y,use="pairwise.complete.obs")
  r <- abs(cor(x, y,use="pairwise.complete.obs"))
  txt <- format(c(r1, 0.123456789), digits=digits)[1]
  txt <- paste(prefix, txt, sep="")
  if(missing(cex.cor)) { cex <- 0.9/strwidth(txt) } else {
     cex = cex.cor}
  text(0.5, 0.5, txt, cex = cex * r)
}

##################################################################
panel.smooth2=function (x, y, col = par("col"), bg = NA, pch = par("pch"),
                        cex = 1, col.smooth = "black", span = 2/3, iter = 3, ...)
{
  points(x, y, pch = pch, col = col, bg = bg, cex = cex)
  ok <- is.finite(x) & is.finite(y)
  if (any(ok))
    lines(stats::lowess(x[ok], y[ok], f = span, iter = iter),
          col = 1, ...)
}

##################################################################
panel.lines2=function (x, y, col = par("col"), bg = NA, pch = par("pch"),
                       cex = 1, ...)
{
  points(x, y, pch = pch, col = col, bg = bg, cex = cex)
  ok <- is.finite(x) & is.finite(y)
  if (any(ok)){
    tmp=lm(y[ok]~x[ok])
    abline(tmp)}
}

##################################################################
panel.hist <- function(x, ...)
{
  usr <- par("usr"); on.exit(par(usr))
  par(usr = c(usr[1:2], 0, 1.5) )
  h <- hist(x, plot = FALSE)
  breaks <- h$breaks; nB <- length(breaks)
  y <- h$counts; y <- y/max(y)
  rect(breaks[-nB], 0, breaks[-1], y, col="white", ...)
}

pairs(select(glm_2014_data, tlf, geno_sex, jday, location), lower.panel = panel.cor, diag.panel = panel.hist, upper.panel = panel.smooth2)

Location and jday will cause some problems. It seems the release location is confounded with the release date. Everything else here looks okay-ish. Will it help us to consider log(tlf).

select(glm_2014_data, tlf, geno_sex, jday, location) %>%
  mutate(tlf = log(tlf+1)) %>%
  pairs(., lower.panel = panel.cor, diag.panel = panel.hist, upper.panel = panel.smooth2)

Let’s also look at the data zero-truncated. Some patterns might be hiding in there but we are blinded by the large number of zeros

select(glm_2014_data, tlf, geno_sex, jday, location) %>%
 filter(tlf !=0) %>%
  mutate(tlf = log(tlf)) %>%
  pairs(., lower.panel = panel.cor, diag.panel = panel.hist, upper.panel = panel.smooth2)

The big takeaways here are:
* we should be wary of instability of parameter estimates of any model that includes both location and jday, because of strong collinearity. Since one is a caterogical we can use the VIF function from the car package to estimate generalized VIFs to evaluate how severe of a problem this is.

Which covariates

Density The first one I can thnk of to pull out the available data is density: How many fish were outplanted at once. Let’s look at this variable.

glm_2014_data %>%
  group_by(jday, location) %>%
  summarise(density = n())
## `summarise()` has grouped output by 'jday'. You can override using the `.groups`
## argument.
d2014 <- glm_2014_data %>%
  group_by(jday, location) %>%
  summarise(density = n())
## `summarise()` has grouped output by 'jday'. You can override using the `.groups`
## argument.
glm_2014_data %<>%
  left_join(d2014)
## Joining, by = c("location", "jday")
select(glm_2014_data, tlf, geno_sex, jday, location, density) %>%
  pairs(., lower.panel = panel.cor, diag.panel = panel.hist, upper.panel = panel.smooth2)

Theres a lot of variation in outplanting density, and, perhaps not surprisingly, a relationship between density and julian day and location.

Julian Day I’m also curious about whether the jday covariate should be included as a linear continous predictor. This doesn’t seem to make a lot of biological sense. Let’s look closely at the distribution and it’s relationship to tlf.

ggplot(glm_2014_data)+geom_smooth(aes(jday, tlf), span = 0.5)+geom_point(aes(jday, tlf))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Hard to tell. Let’s transform, zero-truncate and try again.

ggplot(filter(glm_2014_data, tlf !=0))+geom_smooth(aes(jday, log(tlf)), span = 0.5)+geom_point(aes(jday, log(tlf)))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Well, nothing pops out of a simple LOESS with no covariates, but it’s still a possibility. This is best evaluated using models.

M2014_NB <- glm.nb(tlf ~ geno_sex + jday + location+  geno_sex*jday + density +density*geno_sex , data = glm_2014_data)
M2014_NB_non_linear <-  glm.nb(tlf ~ geno_sex + poly(jday,2, raw = TRUE) + location + density +density*geno_sex  , data = glm_2014_data)
summary(M2014_NB)
## 
## Call:
## glm.nb(formula = tlf ~ geno_sex + jday + location + geno_sex * 
##     jday + density + density * geno_sex, data = glm_2014_data, 
##     init.theta = 0.2534193426, link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.7157  -0.5264  -0.4681  -0.4134   3.3703  
## 
## Coefficients:
##                        Estimate Std. Error z value Pr(>|z|)   
## (Intercept)           -0.503614   2.539262  -0.198  0.84279   
## geno_sexM             -1.104859   2.002582  -0.552  0.58114   
## jday                  -0.001000   0.010468  -0.096  0.92389   
## locationcoopers ridge -0.450110   1.016843  -0.443  0.65802   
## locationhoover        -0.331191   0.604475  -0.548  0.58376   
## locationhorn creek    -0.007831   0.477667  -0.016  0.98692   
## locationkanes         -0.333137   0.617050  -0.540  0.58928   
## density               -0.014700   0.010783  -1.363  0.17278   
## geno_sexM:jday        -0.003282   0.008260  -0.397  0.69108   
## geno_sexM:density      0.034691   0.012993   2.670  0.00759 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(0.2534) family taken to be 1)
## 
##     Null deviance: 356.78  on 860  degrees of freedom
## Residual deviance: 339.32  on 851  degrees of freedom
## AIC: 822.96
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.2534 
##           Std. Err.:  0.0564 
## 
##  2 x log-likelihood:  -800.9610
summary(M2014_NB_non_linear)
## 
## Call:
## glm.nb(formula = tlf ~ geno_sex + poly(jday, 2, raw = TRUE) + 
##     location + density + density * geno_sex, data = glm_2014_data, 
##     init.theta = 0.2525401808, link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.7161  -0.5334  -0.4674  -0.4102   3.3334  
## 
## Coefficients:
##                              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                -1.405e+00  1.090e+01  -0.129 0.897446    
## geno_sexM                  -1.888e+00  4.923e-01  -3.835 0.000126 ***
## poly(jday, 2, raw = TRUE)1  9.271e-03  1.014e-01   0.091 0.927181    
## poly(jday, 2, raw = TRUE)2 -2.698e-05  2.325e-04  -0.116 0.907628    
## locationcoopers ridge      -3.878e-01  1.061e+00  -0.366 0.714651    
## locationhoover             -3.335e-01  6.182e-01  -0.539 0.589588    
## locationhorn creek         -3.059e-02  5.444e-01  -0.056 0.955181    
## locationkanes              -3.401e-01  6.353e-01  -0.535 0.592410    
## density                    -1.580e-02  1.050e-02  -1.504 0.132473    
## geno_sexM:density           3.686e-02  1.208e-02   3.051 0.002282 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(0.2525) family taken to be 1)
## 
##     Null deviance: 356.31  on 860  degrees of freedom
## Residual deviance: 339.02  on 851  degrees of freedom
## AIC: 823.1
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.2525 
##           Std. Err.:  0.0561 
## 
##  2 x log-likelihood:  -801.1030

No, we do not benefit from fitting jday as quadratic.

Let’s do the same comparison for density, as this could also have a non-linear relationship.

ggplot(filter(glm_2014_data, tlf !=0))+geom_smooth(aes(density, log(tlf)))+geom_point(aes(density, log(tlf)))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

ggplot(filter(glm_2014_data))+geom_smooth(aes(density, (tlf)))+geom_point(aes(density, (tlf)))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

M2014_NB <- glm.nb(tlf ~ geno_sex + jday + location+  geno_sex*jday + density +density*geno_sex , data = glm_2014_data)
M2014_NB_non_linear <-  glm.nb(tlf ~ geno_sex + jday + location  + poly(density,2) +  poly(density,2)*geno_sex  , data = glm_2014_data)
summary(M2014_NB)
## 
## Call:
## glm.nb(formula = tlf ~ geno_sex + jday + location + geno_sex * 
##     jday + density + density * geno_sex, data = glm_2014_data, 
##     init.theta = 0.2534193426, link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.7157  -0.5264  -0.4681  -0.4134   3.3703  
## 
## Coefficients:
##                        Estimate Std. Error z value Pr(>|z|)   
## (Intercept)           -0.503614   2.539262  -0.198  0.84279   
## geno_sexM             -1.104859   2.002582  -0.552  0.58114   
## jday                  -0.001000   0.010468  -0.096  0.92389   
## locationcoopers ridge -0.450110   1.016843  -0.443  0.65802   
## locationhoover        -0.331191   0.604475  -0.548  0.58376   
## locationhorn creek    -0.007831   0.477667  -0.016  0.98692   
## locationkanes         -0.333137   0.617050  -0.540  0.58928   
## density               -0.014700   0.010783  -1.363  0.17278   
## geno_sexM:jday        -0.003282   0.008260  -0.397  0.69108   
## geno_sexM:density      0.034691   0.012993   2.670  0.00759 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(0.2534) family taken to be 1)
## 
##     Null deviance: 356.78  on 860  degrees of freedom
## Residual deviance: 339.32  on 851  degrees of freedom
## AIC: 822.96
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.2534 
##           Std. Err.:  0.0564 
## 
##  2 x log-likelihood:  -800.9610
summary(M2014_NB_non_linear)
## 
## Call:
## glm.nb(formula = tlf ~ geno_sex + jday + location + poly(density, 
##     2) + poly(density, 2) * geno_sex, data = glm_2014_data, init.theta = 0.2707107886, 
##     link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.9578  -0.5278  -0.4795  -0.4110   3.2192  
## 
## Coefficients:
##                              Estimate Std. Error z value Pr(>|z|)   
## (Intercept)                 -2.741563   2.623771  -1.045  0.29607   
## geno_sexM                   -0.504801   0.224077  -2.253  0.02427 * 
## jday                         0.005786   0.010463   0.553  0.58022   
## locationcoopers ridge       -0.631774   1.025174  -0.616  0.53772   
## locationhoover               0.178601   0.644795   0.277  0.78179   
## locationhorn creek           0.383719   0.504877   0.760  0.44724   
## locationkanes               -0.185517   0.635501  -0.292  0.77035   
## poly(density, 2)1           -3.483227   5.746130  -0.606  0.54439   
## poly(density, 2)2           12.585924   5.518600   2.281  0.02257 * 
## geno_sexM:poly(density, 2)1 17.815414   6.166119   2.889  0.00386 **
## geno_sexM:poly(density, 2)2 -4.197370   6.251159  -0.671  0.50193   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(0.2707) family taken to be 1)
## 
##     Null deviance: 365.66  on 860  degrees of freedom
## Residual deviance: 342.06  on 850  degrees of freedom
## AIC: 819.4
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.2707 
##           Std. Err.:  0.0614 
## 
##  2 x log-likelihood:  -795.4030
bind_cols(AIC(M2014_NB_non_linear, M2014_NB), BIC(M2014_NB_non_linear, M2014_NB)) %>%
  select(df = df...1, AIC, BIC)
## New names:
anova( M2014_NB_non_linear, M2014_NB )

Including density as a quadratic improves model fit according to AIC, BIC and LRT, but the improvement is small and only marginaly significant. I think the right thing to do here is to fully explore the possibility of non-linear effects of density using the actual data for each year. Fro the 2014 data, which we use to explore the distributions for fitting, we will include a quadratic.

Distributions

Let’s fit some models! I skip some exploration of some interaction terms, but in general the approach here is to fit a model, do some model selection, and conduct model validation for different distributions. Then I compare the model fits using AIC, LRTs and examining the residuals.

Poisson
M2014_P <- glm(tlf ~ geno_sex + jday + location +poly(density,2)+  geno_sex*jday + geno_sex*poly(density,2) , data = glm_2014_data, family = poisson)

summary(M2014_P)  
## 
## Call:
## glm(formula = tlf ~ geno_sex + jday + location + poly(density, 
##     2) + geno_sex * jday + geno_sex * poly(density, 2), family = poisson, 
##     data = glm_2014_data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.4920  -0.5947  -0.5236  -0.4353   5.8041  
## 
## Coefficients:
##                              Estimate Std. Error z value Pr(>|z|)   
## (Intercept)                 -2.948679   1.965680  -1.500  0.13359   
## geno_sexM                    0.400816   1.473625   0.272  0.78563   
## jday                         0.006720   0.007898   0.851  0.39480   
## locationcoopers ridge       -0.684584   0.777852  -0.880  0.37881   
## locationhoover               0.230708   0.486809   0.474  0.63556   
## locationhorn creek           0.453406   0.376342   1.205  0.22829   
## locationkanes               -0.156352   0.488907  -0.320  0.74912   
## poly(density, 2)1           -2.704907   4.450496  -0.608  0.54334   
## poly(density, 2)2           11.806720   3.892228   3.033  0.00242 **
## geno_sexM:jday              -0.004306   0.006791  -0.634  0.52601   
## geno_sexM:poly(density, 2)1 16.319632   5.156601   3.165  0.00155 **
## geno_sexM:poly(density, 2)2 -3.083407   5.023511  -0.614  0.53935   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 691.73  on 860  degrees of freedom
## Residual deviance: 649.75  on 849  degrees of freedom
## AIC: 903.73
## 
## Number of Fisher Scoring iterations: 6
AER::dispersiontest(M2014_P)
## 
##  Overdispersion test
## 
## data:  M2014_P
## z = 2.7855, p-value = 0.002672
## alternative hypothesis: true dispersion is greater than 1
## sample estimates:
## dispersion 
##   1.880578

Welp, it looks the additional covariate is important but didn’t solve our overdispersion problem. The residual deviance is less than the degrees of freedom, and both the density and density*sex interaction have very significant z-statistics. Let’s do some model selection before moving on.

drop1(M2014_P, test = "Chi")

The drop1 approach suggests just the interaction between sex and density is significant. This suggests the final model should include only the two fixed effects and their interaction. Let’s also do this with Wald Tests

M2014_P2 <-  glm(tlf ~ geno_sex + jday  +poly(density,2)+  geno_sex*jday + geno_sex*poly(density,2), family = poisson, data = glm_2014_data) # first drop location
summary(M2014_P2)
## 
## Call:
## glm(formula = tlf ~ geno_sex + jday + poly(density, 2) + geno_sex * 
##     jday + geno_sex * poly(density, 2), family = poisson, data = glm_2014_data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.3913  -0.5902  -0.5218  -0.4570   5.6689  
## 
## Coefficients:
##                              Estimate Std. Error z value Pr(>|z|)   
## (Intercept)                 -2.846695   0.938164  -3.034  0.00241 **
## geno_sexM                    0.466604   1.414254   0.330  0.74145   
## jday                         0.006320   0.004295   1.471  0.14121   
## poly(density, 2)1           -6.964744   3.703410  -1.881  0.06002 . 
## poly(density, 2)2            7.548616   3.393187   2.225  0.02611 * 
## geno_sexM:jday              -0.004563   0.006503  -0.702  0.48294   
## geno_sexM:poly(density, 2)1 16.529300   5.164563   3.201  0.00137 **
## geno_sexM:poly(density, 2)2 -3.707700   4.967362  -0.746  0.45542   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 691.73  on 860  degrees of freedom
## Residual deviance: 655.09  on 853  degrees of freedom
## AIC: 901.06
## 
## Number of Fisher Scoring iterations: 6
# then sex
M2014_P3 <-  glm(tlf ~ geno_sex + jday +poly(density,2)+ geno_sex*poly(density,2) , data = glm_2014_data, family = poisson) # then jday/sex interaction
summary(M2014_P3)
## 
## Call:
## glm(formula = tlf ~ geno_sex + jday + poly(density, 2) + geno_sex * 
##     poly(density, 2), family = poisson, data = glm_2014_data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.3213  -0.5949  -0.5136  -0.4540   5.7581  
## 
## Coefficients:
##                              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                 -2.418084   0.705279  -3.429 0.000607 ***
## geno_sexM                   -0.518868   0.172562  -3.007 0.002640 ** 
## jday                         0.004333   0.003225   1.343 0.179144    
## poly(density, 2)1           -7.712715   3.516029  -2.194 0.028265 *  
## poly(density, 2)2            7.040484   3.306166   2.130 0.033213 *  
## geno_sexM:poly(density, 2)1 18.102061   4.643612   3.898 9.69e-05 ***
## geno_sexM:poly(density, 2)2 -2.414316   4.627452  -0.522 0.601853    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 691.73  on 860  degrees of freedom
## Residual deviance: 655.58  on 854  degrees of freedom
## AIC: 899.55
## 
## Number of Fisher Scoring iterations: 6
M2014_P4 <-  glm(tlf ~ geno_sex  +poly(density,2)+ geno_sex*poly(density,2)  , data = glm_2014_data, family = poisson) # now jday
summary(M2014_P4)
## 
## Call:
## glm(formula = tlf ~ geno_sex + poly(density, 2) + geno_sex * 
##     poly(density, 2), family = poisson, data = glm_2014_data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.1706  -0.5929  -0.5067  -0.4525   5.9664  
## 
## Coefficients:
##                             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                  -1.4924     0.1274 -11.718  < 2e-16 ***
## geno_sexM                    -0.5101     0.1724  -2.958  0.00310 ** 
## poly(density, 2)1            -9.2202     3.3054  -2.789  0.00528 ** 
## poly(density, 2)2             5.9309     3.1783   1.866  0.06203 .  
## geno_sexM:poly(density, 2)1  18.2439     4.6446   3.928 8.57e-05 ***
## geno_sexM:poly(density, 2)2  -2.6109     4.5869  -0.569  0.56921    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 691.73  on 860  degrees of freedom
## Residual deviance: 657.38  on 855  degrees of freedom
## AIC: 899.36
## 
## Number of Fisher Scoring iterations: 6

Both model selection approaches produce the same result. Let’s look at the model fit.

plot(M2014_P4)

EP <- resid(M2014_P4, type = "pearson")
mu <- predict(M2014_P4, type = "response")
plot(x = mu, y = EP, main = "Pearson residuals")

Also plot residuals against predictors.

ggplot()+geom_boxplot(aes( y = resid(M2014_P4, type = "pearson"), x =M2014_P4$data$geno_sex))

ggplot()+geom_point(aes( y = resid(M2014_P4, type = "pearson"), x =M2014_P4$data$density))+geom_smooth(aes( y = resid(M2014_P4, type = "pearson"), x =M2014_P4$data$density))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

ggplot()+geom_point(aes( y = resid(M2014_P4, type = "pearson"), x =M2014_P4$data$density, color = M2014_P4$data$geno_sex))+geom_smooth(aes( y = resid(M2014_P4, type = "pearson"), x =M2014_P4$data$density, color = M2014_P4$data$geno_sex))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

We’ll also try the DHARMa approach, as the expectations for the model validation using Pearson or plain residuals is complicated for a Poisson GLM

simulationOutput <- simulateResiduals(fittedModel = M2014_P, plot = F)
plot(simulationOutput)
## DHARMa:testOutliers with type = binomial may have inflated Type I error rates for integer-valued distributions. To get a more exact result, it is recommended to re-run testOutliers with type = 'bootstrap'. See ?testOutliers for details

testZeroInflation(simulationOutput)

## 
##  DHARMa zero-inflation test via comparison to expected zeros with
##  simulation under H0 = fitted model
## 
## data:  simulationOutput
## ratioObsSim = 1.042, p-value = 0.008
## alternative hypothesis: two.sided
#plotResiduals(simulationOutput,  glm_2014_data$geno_sex)

We’ll discuss these plots in the summary section. Let’s collect a little more info.

First, how does this model compare to a a null model, and how does it compare to the best model without density (e.g. the approach taken before).

M2014_Pnull <- glm(tlf ~ 1 , data = glm_2014_data, family = poisson)
anova(M2014_Pnull,M2014_P4, test = "Chisq")

The final model is a substantially better fit to the data than a null model.

M2014_Pno_dens_full <- glm(tlf ~ geno_sex + location+ jday+geno_sex*jday , data = glm_2014_data, family = poisson)
summary(M2014_Pno_dens_full)
## 
## Call:
## glm(formula = tlf ~ geno_sex + location + jday + geno_sex * jday, 
##     family = poisson, data = glm_2014_data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.8342  -0.6063  -0.5466  -0.4906   5.9508  
## 
## Coefficients:
##                        Estimate Std. Error z value Pr(>|z|)  
## (Intercept)           -2.435520   1.797831  -1.355   0.1755  
## geno_sexM              2.191990   1.231679   1.780   0.0751 .
## locationcoopers ridge -0.165486   0.632490  -0.262   0.7936  
## locationhoover        -0.197001   0.420270  -0.469   0.6392  
## locationhorn creek     0.070156   0.340047   0.206   0.8365  
## locationkanes         -0.122423   0.397023  -0.308   0.7578  
## jday                   0.005168   0.007303   0.708   0.4792  
## geno_sexM:jday        -0.012816   0.005700  -2.248   0.0246 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 691.73  on 860  degrees of freedom
## Residual deviance: 674.99  on 853  degrees of freedom
## AIC: 920.97
## 
## Number of Fisher Scoring iterations: 6
drop1(M2014_Pno_dens_full, test = "Chisq")

The best model without density is jday + sex and their interaction. Model validation for this model is below.

M2014_Pno_dens_final <- glm(tlf ~ geno_sex  +jday+geno_sex*jday , data = glm_2014_data, family = poisson)
summary(M2014_Pno_dens_final)
## 
## Call:
## glm(formula = tlf ~ geno_sex + jday + geno_sex * jday, family = poisson, 
##     data = glm_2014_data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.8493  -0.6030  -0.5455  -0.4888   5.9251  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -3.007414   0.862807  -3.486 0.000491 ***
## geno_sexM       2.134603   1.227003   1.740 0.081914 .  
## jday            0.007444   0.003931   1.894 0.058277 .  
## geno_sexM:jday -0.012555   0.005678  -2.211 0.027033 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 691.73  on 860  degrees of freedom
## Residual deviance: 675.59  on 857  degrees of freedom
## AIC: 913.56
## 
## Number of Fisher Scoring iterations: 6
plot(M2014_Pno_dens_final)

Summary of Poisson fit

These results suggest adding the density covariate was very important, but we still have an overdispersion problem. QQplots for Poisson will always be problematic, but the DHARMa approach can help us interpret what’s going on.

It seems to me that we are dealing with overdispersion due to zero inflation, but it will be hard to tell without fitting a zero-inflated model and negative binomial. The model is struggling between fitting all the zeros and fitting the tlf > 1. This could explain the break in the qqplot, the DHARMa residuals being worse at higher predicted values and potentially the overdispersion.

NB

Let’s skip the quasipoisson and jump straight to negative binomial.

M2014_NB <- glm.nb(tlf ~ geno_sex + jday + location +poly(density,2)+  geno_sex*jday + geno_sex*poly(density,2) , data = glm_2014_data)

summary(M2014_NB)  
## 
## Call:
## glm.nb(formula = tlf ~ geno_sex + jday + location + poly(density, 
##     2) + geno_sex * jday + geno_sex * poly(density, 2), data = glm_2014_data, 
##     init.theta = 0.2713657047, link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.0256  -0.5082  -0.4683  -0.4084   3.3134  
## 
## Coefficients:
##                              Estimate Std. Error z value Pr(>|z|)  
## (Intercept)                 -3.584355   2.727940  -1.314   0.1889  
## geno_sexM                    1.302202   1.936766   0.672   0.5014  
## jday                         0.009808   0.011079   0.885   0.3760  
## locationcoopers ridge       -0.708263   1.027508  -0.689   0.4906  
## locationhoover               0.139661   0.647768   0.216   0.8293  
## locationhorn creek           0.366745   0.505718   0.725   0.4683  
## locationkanes               -0.217132   0.637012  -0.341   0.7332  
## poly(density, 2)1           -1.916505   5.996289  -0.320   0.7493  
## poly(density, 2)2           14.292497   5.652361   2.529   0.0115 *
## geno_sexM:jday              -0.008385   0.008927  -0.939   0.3476  
## geno_sexM:poly(density, 2)1 15.080298   6.805653   2.216   0.0267 *
## geno_sexM:poly(density, 2)2 -6.703253   6.726869  -0.996   0.3190  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(0.2714) family taken to be 1)
## 
##     Null deviance: 365.98  on 860  degrees of freedom
## Residual deviance: 341.52  on 849  degrees of freedom
## AIC: 820.56
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.2714 
##           Std. Err.:  0.0613 
## 
##  2 x log-likelihood:  -794.5640
drop1(M2014_NB, test = "Chi")
M2014_NB2 <-  glm.nb(tlf ~ geno_sex + jday +poly(density,2)+  geno_sex*jday + geno_sex*poly(density,2) , data = glm_2014_data) # first drop location
summary(M2014_NB2)
## 
## Call:
## glm.nb(formula = tlf ~ geno_sex + jday + poly(density, 2) + geno_sex * 
##     jday + geno_sex * poly(density, 2), data = glm_2014_data, 
##     init.theta = 0.2623846579, link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.9908  -0.5182  -0.4517  -0.4186   3.2491  
## 
## Coefficients:
##                              Estimate Std. Error z value Pr(>|z|)   
## (Intercept)                 -3.619156   1.399603  -2.586  0.00971 **
## geno_sexM                    1.201567   1.899157   0.633  0.52694   
## jday                         0.009832   0.006442   1.526  0.12697   
## poly(density, 2)1           -6.394979   5.134013  -1.246  0.21291   
## poly(density, 2)2           10.434882   4.940848   2.112  0.03469 * 
## geno_sexM:jday              -0.007898   0.008756  -0.902  0.36704   
## geno_sexM:poly(density, 2)1 15.990142   6.823408   2.343  0.01911 * 
## geno_sexM:poly(density, 2)2 -6.981913   6.695046  -1.043  0.29702   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(0.2624) family taken to be 1)
## 
##     Null deviance: 361.46  on 860  degrees of freedom
## Residual deviance: 339.77  on 853  degrees of freedom
## AIC: 815.02
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.2624 
##           Std. Err.:  0.0586 
## 
##  2 x log-likelihood:  -797.0160
# then sex
M2014_NB3 <-  glm.nb(tlf ~ geno_sex + jday + poly(density,2) + geno_sex*poly(density,2) , data = glm_2014_data) # then jday/sex interaction
summary(M2014_NB3)
## 
## Call:
## glm.nb(formula = tlf ~ geno_sex + jday + poly(density, 2) + geno_sex * 
##     poly(density, 2), data = glm_2014_data, init.theta = 0.2615858065, 
##     link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.9236  -0.5252  -0.4652  -0.4155   3.1629  
## 
## Coefficients:
##                              Estimate Std. Error z value Pr(>|z|)   
## (Intercept)                 -2.666990   0.949697  -2.808  0.00498 **
## geno_sexM                   -0.500656   0.224857  -2.227  0.02598 * 
## jday                         0.005414   0.004362   1.241  0.21462   
## poly(density, 2)1           -7.743944   4.835767  -1.601  0.10929   
## poly(density, 2)2            8.827393   4.715497   1.872  0.06121 . 
## geno_sexM:poly(density, 2)1 18.563910   6.170363   3.009  0.00262 **
## geno_sexM:poly(density, 2)2 -4.645060   6.233936  -0.745  0.45620   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(0.2616) family taken to be 1)
## 
##     Null deviance: 361.05  on 860  degrees of freedom
## Residual deviance: 340.13  on 854  degrees of freedom
## AIC: 813.77
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.2616 
##           Std. Err.:  0.0586 
## 
##  2 x log-likelihood:  -797.7650
M2014_NB4 <-  glm.nb(tlf ~ geno_sex  +poly(density,2)+ poly(density,2) , data = glm_2014_data) # now jday
summary(M2014_NB4)
## 
## Call:
## glm.nb(formula = tlf ~ geno_sex + poly(density, 2) + poly(density, 
##     2), data = glm_2014_data, init.theta = 0.2335681815, link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.6743  -0.5337  -0.4701  -0.4350   2.9996  
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        -1.4546     0.1720  -8.459   <2e-16 ***
## geno_sexM          -0.5171     0.2226  -2.322   0.0202 *  
## poly(density, 2)1   0.6975     3.0283   0.230   0.8178    
## poly(density, 2)2   5.8348     3.0826   1.893   0.0584 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(0.2336) family taken to be 1)
## 
##     Null deviance: 345.81  on 860  degrees of freedom
## Residual deviance: 336.11  on 857  degrees of freedom
## AIC: 818.24
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.2336 
##           Std. Err.:  0.0507 
## 
##  2 x log-likelihood:  -808.2410

Same results as for the Poisson so far. Model selection by both Wald tests and LRT suggests a final model with sex, density and their interaction. Let’s look at the model fit.

plot(M2014_NB4)

EP <- resid(M2014_NB4, type = "pearson")
mu <- predict(M2014_NB4, type = "response")
plot(x = mu, y = EP, main = "Pearson residuals")

simulationOutput <- simulateResiduals(fittedModel = M2014_NB4, plot = F)
plot(simulationOutput)

testZeroInflation(simulationOutput)

## 
##  DHARMa zero-inflation test via comparison to expected zeros with
##  simulation under H0 = fitted model
## 
## data:  simulationOutput
## ratioObsSim = 0.9998, p-value = 0.992
## alternative hypothesis: two.sided
#plotResiduals(simulationOutput,  glm_2014_data$geno_sex)
AIC(M2014_P4, M2014_NB4)

Summary NB

Allowing for separate variance and mean estimation (e.g. poisson -> negative binomial) looks like it improved the fit substantially. We no longer see evidence of overdispersion. The only flag comes from DHARMa’s outlier test, but the leverage plot suggests nothing too severe is going on. This model fit looks good.

Zero inflation - ZIP

I choose to examine mixture models as well as two-part/hurdle models. I think the zeros include both true and false values and given the low mean fitness in the system, I expect many parents to produce zero offspring and for our covariates to have some impact, therefore a mixture model is probably a better fit. Here we fit a zero inflated poisson.

For the zero-inflation models I fit the binomial/logit part of the model with an intercept only, since the zeros I’m trying to model here have nothing to do with our covariates.

M2014_ZIP <- zeroinfl(tlf ~ geno_sex + jday + location +poly(density,2)+  geno_sex*jday + geno_sex*poly(density,2) | 1, data = glm_2014_data)
summary(M2014_ZIP)
## 
## Call:
## zeroinfl(formula = tlf ~ geno_sex + jday + location + poly(density, 2) + 
##     geno_sex * jday + geno_sex * poly(density, 2) | 1, data = glm_2014_data)
## 
## Pearson residuals:
##     Min      1Q  Median      3Q     Max 
## -0.5839 -0.3415 -0.3123 -0.2753 13.4970 
## 
## Count model coefficients (poisson with log link):
##                               Estimate Std. Error z value Pr(>|z|)   
## (Intercept)                  -4.921659   2.434317  -2.022  0.04320 * 
## geno_sexM                     2.458040   1.802451   1.364  0.17266   
## jday                          0.020658   0.009804   2.107  0.03511 * 
## locationcoopers ridge         0.152304   0.952261   0.160  0.87293   
## locationhoover                0.486270   0.570816   0.852  0.39428   
## locationhorn creek            0.541900   0.427402   1.268  0.20484   
## locationkanes                 0.117100   0.580648   0.202  0.84017   
## poly(density, 2)1            -1.738541   5.349301  -0.325  0.74518   
## poly(density, 2)2            17.805699   5.438811   3.274  0.00106 **
## geno_sexM:jday               -0.013553   0.008271  -1.639  0.10130   
## geno_sexM:poly(density, 2)1  12.088311   6.235598   1.939  0.05255 . 
## geno_sexM:poly(density, 2)2 -12.051331   6.481831  -1.859  0.06299 . 
## 
## Zero-inflation model coefficients (binomial with logit link):
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   0.9524     0.1939   4.912 9.03e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Number of iterations in BFGS optimization: 32 
## Log-likelihood:  -405 on 13 Df

Already we are seeing something interesting here.

Instead of model selection on a ZIP, we’ll assume the same covariates are important from the poisson side of the model.

Same results.

M2014_ZIP4 <- pscl::zeroinfl(tlf ~ geno_sex +poly(density,2) + geno_sex*poly(density,2) | 1, data = glm_2014_data)
## Registered S3 methods overwritten by 'pscl':
##   method                 from    
##   print.zeroinfl         countreg
##   print.summary.zeroinfl countreg
##   summary.zeroinfl       countreg
##   coef.zeroinfl          countreg
##   vcov.zeroinfl          countreg
##   logLik.zeroinfl        countreg
##   predict.zeroinfl       countreg
##   residuals.zeroinfl     countreg
##   fitted.zeroinfl        countreg
##   terms.zeroinfl         countreg
##   model.matrix.zeroinfl  countreg
##   extractAIC.zeroinfl    countreg
##   print.hurdle           countreg
##   print.summary.hurdle   countreg
##   summary.hurdle         countreg
##   coef.hurdle            countreg
##   vcov.hurdle            countreg
##   logLik.hurdle          countreg
##   predict.hurdle         countreg
##   residuals.hurdle       countreg
##   fitted.hurdle          countreg
##   terms.hurdle           countreg
##   model.matrix.hurdle    countreg
##   extractAIC.hurdle      countreg
summary(M2014_ZIP4)
## 
## Call:
## pscl::zeroinfl(formula = tlf ~ geno_sex + poly(density, 2) + geno_sex * 
##     poly(density, 2) | 1, data = glm_2014_data)
## 
## Pearson residuals:
##     Min      1Q  Median      3Q     Max 
## -0.4862 -0.3388 -0.3123 -0.2865 13.3049 
## 
## Count model coefficients (poisson with log link):
##                             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)                  -0.1884     0.1813  -1.039   0.2988  
## geno_sexM                    -0.4806     0.2042  -2.354   0.0186 *
## poly(density, 2)1            -7.1034     4.1918  -1.695   0.0901 .
## poly(density, 2)2             7.5617     3.9687   1.905   0.0567 .
## geno_sexM:poly(density, 2)1  14.7222     5.7614   2.555   0.0106 *
## geno_sexM:poly(density, 2)2  -5.4823     5.6392  -0.972   0.3310  
## 
## Zero-inflation model coefficients (binomial with logit link):
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   1.0187     0.1895   5.376 7.62e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Number of iterations in BFGS optimization: 31 
## Log-likelihood: -409.3 on 7 Df

There is no automatic model validation plot method built into the plot function, so let’s do it ourselves.

plot(fitted(M2014_ZIP4), resid(M2014_ZIP4, type = "pearson"))

Zero-inflation ZINB
M2014_ZINB <- zeroinfl(tlf ~ geno_sex + jday + location +poly(density,2)+  geno_sex*jday + geno_sex*poly(density,2) |1 , data = glm_2014_data, dist = "negbin")

summary(M2014_ZINB)
## 
## Call:
## zeroinfl(formula = tlf ~ geno_sex + jday + location + poly(density, 2) + 
##     geno_sex * jday + geno_sex * poly(density, 2) | 1, data = glm_2014_data, 
##     dist = "negbin")
## 
## Pearson residuals:
##     Min      1Q  Median      3Q     Max 
## -0.4822 -0.3207 -0.3000 -0.2677 13.0899 
## 
## Count model coefficients (negbin with log link):
##                              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                 -3.575789   2.898266  -1.234   0.2173    
## geno_sexM                    1.283684   1.996354   0.643   0.5202    
## jday                         0.009767   0.011818   0.827   0.4085    
## locationcoopers ridge       -0.703043   1.052218  -0.668   0.5040    
## locationhoover               0.138450   0.658993   0.210   0.8336    
## locationhorn creek           0.365129   0.507626   0.719   0.4720    
## locationkanes               -0.216372   0.657110  -0.329   0.7419    
## poly(density, 2)1           -2.165377   6.052138  -0.358   0.7205    
## poly(density, 2)2           14.283869   6.061599   2.356   0.0185 *  
## geno_sexM:jday              -0.008296   0.009222  -0.900   0.3683    
## geno_sexM:poly(density, 2)1 15.458514   6.872762   2.249   0.0245 *  
## geno_sexM:poly(density, 2)2 -6.772425   7.002448  -0.967   0.3335    
## Log(theta)                  -1.303498   0.231734  -5.625 1.86e-08 ***
## 
## Zero-inflation model coefficients (binomial with logit link):
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept)    -6.88      33.22  -0.207    0.836
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Theta = 0.2716 
## Number of iterations in BFGS optimization: 32 
## Log-likelihood: -397.3 on 14 Df

Now we can check if the ZIP model solves the problem of overdispersion.

require(lmtest)
## Loading required package: lmtest
## Warning: package 'lmtest' was built under R version 3.6.2
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 3.6.2
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
lrtest( M2014_ZIP, M2014_ZINB)

This overwhelmingly suggests that the zero inflated NB is better than zero-inflated model. Let’s not take the time to learn model selection for right now and assume the same model as non-zero-inflated models so far

# can't easily use the drop1 function here, let's conduct some LRTs 
# we'll use the wald test do make some informed decisions
M2014_ZINB4 <- zeroinfl(tlf ~ geno_sex  +poly(density,2) + geno_sex*poly(density,2) |1 , data = glm_2014_data, dist = "negbin") #jday worst in both sides of the model

lrtest(M2014_ZIP4, M2014_ZINB4)
summary(M2014_ZINB4)
## 
## Call:
## zeroinfl(formula = tlf ~ geno_sex + poly(density, 2) + geno_sex * poly(density, 
##     2) | 1, data = glm_2014_data, dist = "negbin")
## 
## Pearson residuals:
##     Min      1Q  Median      3Q     Max 
## -0.4392 -0.3215 -0.2938 -0.2707 13.0493 
## 
## Count model coefficients (negbin with log link):
##                             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                  -1.5007     0.1736  -8.645  < 2e-16 ***
## geno_sexM                    -0.5011     0.2241  -2.236  0.02534 *  
## poly(density, 2)1            -9.4631     4.6241  -2.046  0.04071 *  
## poly(density, 2)2             6.9526     4.7433   1.466  0.14271    
## geno_sexM:poly(density, 2)1  18.3889     6.1992   2.966  0.00301 ** 
## geno_sexM:poly(density, 2)2  -3.9010     6.3915  -0.610  0.54163    
## Log(theta)                   -1.3518     0.2248  -6.014 1.81e-09 ***
## 
## Zero-inflation model coefficients (binomial with logit link):
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept)   -8.345     65.830  -0.127    0.899
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Theta = 0.2588 
## Number of iterations in BFGS optimization: 257 
## Log-likelihood: -399.6 on 8 Df

Here the zero-inflation part of the model isn’t doing much. The NB is catching most of the overdispersion we observed earlier

Zero inflation hurdle nb

For the Hurdle we need to include the covariates on the zero part of the model.

M2014_HNB4 <- hurdle(tlf ~ geno_sex  +poly(density,2) + geno_sex*poly(density,2) , data = glm_2014_data, dist = "negbin") #jday worst in both sides of the model
summary(M2014_HNB4)
## 
## Call:
## hurdle(formula = tlf ~ geno_sex + poly(density, 2) + geno_sex * poly(density, 
##     2), data = glm_2014_data, dist = "negbin")
## 
## Pearson residuals:
##     Min      1Q  Median      3Q     Max 
## -0.4336 -0.3398 -0.2680 -0.2468 11.9331 
## 
## Count model coefficients (truncated negbin with log link):
##                             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)                 -10.0867   106.8680  -0.094   0.9248  
## geno_sexM                    -0.2237     0.4802  -0.466   0.6413  
## poly(density, 2)1            -4.8309     9.0653  -0.533   0.5941  
## poly(density, 2)2            18.3610    10.2839   1.785   0.0742 .
## geno_sexM:poly(density, 2)1   5.0283    13.0260   0.386   0.6995  
## geno_sexM:poly(density, 2)2 -19.3272    14.0908  -1.372   0.1702  
## Log(theta)                  -10.0732   106.8729  -0.094   0.9249  
## Zero hurdle model coefficients (binomial with logit link):
##                             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                  -1.7276     0.1666 -10.370  < 2e-16 ***
## geno_sexM                    -0.4672     0.2193  -2.131  0.03311 *  
## poly(density, 2)1            -9.7754     4.7434  -2.061  0.03932 *  
## poly(density, 2)2             1.9248     4.5045   0.427  0.66915    
## geno_sexM:poly(density, 2)1  20.0356     6.2196   3.221  0.00128 ** 
## geno_sexM:poly(density, 2)2   2.2995     6.0887   0.378  0.70568    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Theta: count = 0
## Number of iterations in BFGS optimization: 77 
## Log-likelihood: -396.5 on 13 Df
Compare

Let’s compare models. First let’s use a rootogram.

rootogram(M2014_P4, main = "Poisson")

rootogram(M2014_NB4, main = "Negative Binomial")

rootogram(M2014_ZIP4, main = "Zero-Inflated Poisson")

rootogram(M2014_ZINB4, main = "Zero-Inflated Negative Binomial")

rootogram(M2014_HNB4, main = "Hurdle Negative Binomial")

Now let’s look at a version of qqplots for discrete data. Here we plot randomized quantile residuals against theoretical quantiles.

qqrplot(M2014_P4, main = "Poisson")

qqrplot(M2014_NB4, main = "Negative Binomial")

qqrplot(M2014_ZIP4, main = "Zero-Inflated Poisson")

qqrplot(M2014_ZINB4, main = "Zero-Inflated Negative Binomial")

qqrplot(M2014_HNB4, main = "Hurdle Negative Binomial")

Now let’s look at AIC and BIC

AIC(M2014_ZIP4, M2014_ZINB4, M2014_P4, M2014_NB4, M2014_HNB4)
BIC(M2014_ZIP4, M2014_ZINB4, M2014_P4, M2014_NB4, M2014_HNB4)

Finally, let’s conduct some likelihood ratio tests on nested models. Most of these are already done above, but collecting thme here.

lrtest( M2014_ZIP4, M2014_ZINB4 )
lrtest( M2014_P4, M2014_NB4 )

Discussion

The rootograms tell us that all models fit the majority of observations (where tlf = 0) quite well. The difference seems to be how well fitness of >0 are fit. Models that do not allow for overdispersion (Poisson and ZIP) are worse than the other models. Negative binomial, hurdle negative binomial and zero-inflated negative bionomial all are roughly equivalent, with the more complex hurdle and zero-inflated negative binomial performing only a little better. The same follows for the qq plots based on randomized residuals. This would suggest we do not need the sophistication and more challenging to interpret zero-inflated/hurdle models, and a “simple” negative binomial glm will suffice.
The LRTs of nested models tell us that we definitely need to deal with overdispersion.
When we use AIC or BIC, we also find that the negative binomial is the best model.

All ways to look at this point the same direction. Use the negative binomial.

2011

glm_2011_data %>%
  group_by(jday, location) %>%
  summarise(density = n())
## `summarise()` has grouped output by 'jday'. You can override using the `.groups`
## argument.
d2011 <- glm_2011_data %>%
  group_by(jday, location) %>%
  summarise(density = n())
## `summarise()` has grouped output by 'jday'. You can override using the `.groups`
## argument.
glm_2011_data %<>%
  left_join(d2011)
## Joining, by = c("location", "jday")
f <- glm_2011_data  %>% 
  filter(geno_sex == "F") %>%
  group_by(location, jday) %>% 
  summarise(n_female_rg = n()) 
## `summarise()` has grouped output by 'location'. You can override using the
## `.groups` argument.
glm_2011_data  %<>%
  left_join(f)
## Joining, by = c("location", "jday")
m <- glm_2011_data  %>% 
  filter(geno_sex == "M") %>%
  group_by(location, jday) %>% 
  summarise(n_male_rg = n()) 
## `summarise()` has grouped output by 'location'. You can override using the
## `.groups` argument.
glm_2011_data  %<>%
  left_join(m) %>%
  mutate(sex_ratio_rg = n_male_rg/n_female_rg,
         sex_ratio_rg_l = log(sex_ratio_rg))
## Joining, by = c("location", "jday")
select(glm_2011_data, tlf, geno_sex, jday, density, sex_ratio_rg_l ) %>%
  pairs(., lower.panel = panel.cor, diag.panel = panel.hist, upper.panel = panel.smooth2)

There are only three release days and one release location in 2011. Since density and sex ratio is calculated per release site, per day, this means we may have strong collinearity. Let’s check by fitting a model without interaction (below) and calculating VIFs

M2011_NBx <- glm.nb(tlf ~ geno_sex   +jday + sex_ratio_rg_l  + density , data = glm_2011_data)

M2011_NB <- glm.nb(tlf ~ geno_sex   +jday + sex_ratio_rg_l  , data = glm_2011_data)
M2011_NBa <- glm.nb(tlf ~ geno_sex +density  +jday   , data = glm_2011_data)
M2011_NBb <- glm.nb(tlf ~ geno_sex  +jday  + sex_ratio_rg_l  , data = glm_2011_data)
#summary(M2011_NB)  
#summary(M2011_NBa) 
#summary(M2011_NBb) 

Cannot fit the full model because of the strong collinearity between sex ratio, density and date. So fit four models, dropping one predictor from each. Is mutlicollinearity still an issue in the data after?

#vif(M2011_NBx)
vif(M2011_NB)
##       geno_sex           jday sex_ratio_rg_l 
##       1.027110       1.673455       1.701048
vif(M2011_NBa)
## geno_sex  density     jday 
## 1.027110 1.079267 1.079971
vif(M2011_NBb)
##       geno_sex           jday sex_ratio_rg_l 
##       1.027110       1.673455       1.701048

No. We can expect reasonable standard errors of the main effect estimates for a model that includes all three. Let’s choose the most informative by AIC and not forget that unincluded variable may be confounding the fit when we interpret the results

AIC(M2011_NB, M2011_NBa, M2011_NBb)

All fit the data equally well. Which suggests we still have an issue with correlation. We’ll leave out jday and density.

First, we’ll use a LRT and AIC to examine if it is worth including the non-linear effect of sex ratio

M2011_NB <- glm.nb(tlf ~ geno_sex  +density +density*geno_sex , data = glm_2011_data)
M2011_NB3 <- glm.nb(tlf ~ geno_sex   +poly(density,2) +poly(density,2)*geno_sex , data = glm_2011_data)
AIC(M2011_NB,  M2011_NB3)
BIC(M2011_NB,  M2011_NB3)
anova(M2011_NB,  M2011_NB3)
#drop1(M2011_NB2, test = "Chisq")

No, just use a linear effect

Now we’ll do model selection.

drop1(M2011_NB, test = "Chisq")
drop1(glm.nb(tlf ~ geno_sex  +density , data = glm_2011_data), test = "Chisq")

LRT suggests no significant effects.

summary(M2011_NB)
## 
## Call:
## glm.nb(formula = tlf ~ geno_sex + density + density * geno_sex, 
##     data = glm_2011_data, init.theta = 0.268641156, link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.9500  -0.9491  -0.8304   0.1690   2.0947  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)
## (Intercept)        0.043927   0.685184   0.064    0.949
## geno_sexM          0.294642   0.898878   0.328    0.743
## density            0.001248   0.009260   0.135    0.893
## geno_sexM:density -0.009046   0.012564  -0.720    0.472
## 
## (Dispersion parameter for Negative Binomial(0.2686) family taken to be 1)
## 
##     Null deviance: 109.54  on 147  degrees of freedom
## Residual deviance: 108.11  on 144  degrees of freedom
## AIC: 386.33
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.2686 
##           Std. Err.:  0.0592 
## 
##  2 x log-likelihood:  -376.3270
summary(glm.nb(tlf ~ geno_sex  +density , data = glm_2011_data))
## 
## Call:
## glm.nb(formula = tlf ~ geno_sex + density, data = glm_2011_data, 
##     init.theta = 0.266005344, link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.9843  -0.9235  -0.8535   0.1166   2.2631  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)
## (Intercept)  0.412897   0.495808   0.833    0.405
## geno_sexM   -0.302581   0.361034  -0.838    0.402
## density     -0.004033   0.006260  -0.644    0.519
## 
## (Dispersion parameter for Negative Binomial(0.266) family taken to be 1)
## 
##     Null deviance: 108.91  on 147  degrees of freedom
## Residual deviance: 107.99  on 145  degrees of freedom
## AIC: 384.83
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.2660 
##           Std. Err.:  0.0584 
## 
##  2 x log-likelihood:  -376.8280
summary(glm.nb(tlf ~ 1, data = glm_2011_data))
## 
## Call:
## glm.nb(formula = tlf ~ 1, data = glm_2011_data, init.theta = 0.2615848412, 
##     link = log)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -0.90881  -0.90881  -0.90881  -0.00306   2.26035  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.006734   0.180393   0.037     0.97
## 
## (Dispersion parameter for Negative Binomial(0.2616) family taken to be 1)
## 
##     Null deviance: 107.85  on 147  degrees of freedom
## Residual deviance: 107.85  on 147  degrees of freedom
## AIC: 381.74
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.2616 
##           Std. Err.:  0.0571 
## 
##  2 x log-likelihood:  -377.7390

Wald tests agree

Summary There were no significant effects.

2012

glm_2012_data %>%
  group_by(jday, location) %>%
  summarise(density = n())
## `summarise()` has grouped output by 'jday'. You can override using the `.groups`
## argument.
d2012 <- glm_2012_data %>%
  group_by(jday, location) %>%
  summarise(density = n())
## `summarise()` has grouped output by 'jday'. You can override using the `.groups`
## argument.
glm_2012_data %<>%
  left_join(d2012)
## Joining, by = c("location", "jday")
f <- glm_2012_data  %>% 
  filter(geno_sex == "F") %>%
  group_by(location, jday) %>% 
  summarise(n_female_rg = n()) 
## `summarise()` has grouped output by 'location'. You can override using the
## `.groups` argument.
glm_2012_data  %<>%
  left_join(f)
## Joining, by = c("location", "jday")
m <- glm_2012_data  %>% 
  filter(geno_sex == "M") %>%
  group_by(location, jday) %>% 
  summarise(n_male_rg = n()) 
## `summarise()` has grouped output by 'location'. You can override using the
## `.groups` argument.
glm_2012_data  %<>%
  left_join(m) %>%
  mutate(sex_ratio_rg = n_male_rg/n_female_rg) %>%
  mutate(sex_ratio_rg_l = log(sex_ratio_rg))
## Joining, by = c("location", "jday")
select(glm_2012_data, tlf, geno_sex, jday, location, density, sex_ratio_rg_l) %>%
  pairs(., lower.panel = panel.cor, diag.panel = panel.hist, upper.panel = panel.smooth2)

str(glm_2012_data)
## tibble [258 × 10] (S3: tbl_df/tbl/data.frame)
##  $ date          : Factor w/ 10 levels "2012-06-01","2012-06-14",..: 1 1 1 1 1 1 1 1 1 2 ...
##  $ geno_sex      : Factor w/ 2 levels "F","M": 2 2 2 2 2 2 2 2 2 2 ...
##  $ location      : Factor w/ 2 levels "breitenbush river",..: 2 2 2 2 2 2 2 2 2 1 ...
##  $ tlf           : num [1:258] 0 0 0 0 8 0 0 0 1 3 ...
##  $ jday          : num [1:258] 153 153 153 153 153 153 153 153 153 166 ...
##  $ density       : int [1:258] 25 25 25 25 25 25 25 25 25 46 ...
##  $ n_female_rg   : int [1:258] 16 16 16 16 16 16 16 16 16 26 ...
##  $ n_male_rg     : int [1:258] 9 9 9 9 9 9 9 9 9 20 ...
##  $ sex_ratio_rg  : num [1:258] 0.562 0.562 0.562 0.562 0.562 ...
##  $ sex_ratio_rg_l: num [1:258] -0.575 -0.575 -0.575 -0.575 -0.575 ...

As with 2011, we need to simplify our model to avoid collinearity here. There are 2 release sites, but one was used on only a single day. We’ll skip location to retain date, because location is so unbalanced.

M2012_NBx <- glm.nb(tlf ~ geno_sex  +density  +jday  +sex_ratio_rg_l + location, data = glm_2012_data)


M2012_NB <- glm.nb(tlf ~ geno_sex  +density  +jday  +sex_ratio_rg_l, data = glm_2012_data)

summary(M2012_NB)  
## 
## Call:
## glm.nb(formula = tlf ~ geno_sex + density + jday + sex_ratio_rg_l, 
##     data = glm_2012_data, init.theta = 0.3714406409, link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.1513  -1.0163  -0.8576   0.1353   1.9958  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)  
## (Intercept)     1.040529   1.513580   0.687   0.4918  
## geno_sexM       0.066039   0.240934   0.274   0.7840  
## density         0.016721   0.008957   1.867   0.0619 .
## jday           -0.007212   0.006650  -1.085   0.2781  
## sex_ratio_rg_l  0.589420   0.520472   1.132   0.2574  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(0.3714) family taken to be 1)
## 
##     Null deviance: 224.47  on 257  degrees of freedom
## Residual deviance: 212.61  on 253  degrees of freedom
## AIC: 741.63
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.3714 
##           Std. Err.:  0.0607 
## 
##  2 x log-likelihood:  -729.6330
vif(M2012_NB)
##       geno_sex        density           jday sex_ratio_rg_l 
##       1.017726       1.485605       1.338664       1.472471
vif(M2012_NBx)
##       geno_sex        density           jday sex_ratio_rg_l       location 
##       1.017819       1.489281       1.791856       1.485939       1.515559

Looks good.

M2012_NB <- glm.nb(tlf ~ geno_sex  +density +geno_sex*density +jday+ jday*geno_sex +sex_ratio_rg_l + sex_ratio_rg_l*geno_sex  , data = glm_2012_data)

summary(M2012_NB)  
## 
## Call:
## glm.nb(formula = tlf ~ geno_sex + density + geno_sex * density + 
##     jday + jday * geno_sex + sex_ratio_rg_l + sex_ratio_rg_l * 
##     geno_sex, data = glm_2012_data, init.theta = 0.3795880981, 
##     link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.2163  -1.0635  -0.8242   0.1721   2.0793  
## 
## Coefficients:
##                           Estimate Std. Error z value Pr(>|z|)
## (Intercept)               2.756033   1.948547   1.414    0.157
## geno_sexM                -4.071477   3.066174  -1.328    0.184
## density                   0.005871   0.012237   0.480    0.631
## jday                     -0.013743   0.008521  -1.613    0.107
## sex_ratio_rg_l            0.971034   0.683609   1.420    0.155
## geno_sexM:density         0.022867   0.017913   1.277    0.202
## geno_sexM:jday            0.016334   0.013534   1.207    0.227
## geno_sexM:sex_ratio_rg_l -0.807505   1.054335  -0.766    0.444
## 
## (Dispersion parameter for Negative Binomial(0.3796) family taken to be 1)
## 
##     Null deviance: 227.22  on 257  degrees of freedom
## Residual deviance: 213.09  on 250  degrees of freedom
## AIC: 745.57
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.3796 
##           Std. Err.:  0.0625 
## 
##  2 x log-likelihood:  -727.5740

Much better. Now let’s get to model selection.

First should we fit either jday, density, or sex ratio as non-linear?

M2012_NB_nlj <- glm.nb(tlf ~ geno_sex  +density +geno_sex*density +poly(jday,2)+poly(jday,2)*geno_sex + sex_ratio_rg_l +sex_ratio_rg_l*geno_sex , data = glm_2012_data)

M2012_NB_nld <- glm.nb(tlf ~ geno_sex  +poly(density,2) +geno_sex*poly(density,2) +jday +jday*geno_sex + sex_ratio_rg_l +sex_ratio_rg_l*geno_sex  , data = glm_2012_data)

M2012_NB_nldj <- glm.nb(tlf ~ geno_sex  +poly(density,2) +geno_sex*poly(density,2) +poly(jday,2) +poly(jday,2)*geno_sex + sex_ratio_rg_l +sex_ratio_rg_l*geno_sex  , data = glm_2012_data)

M2012_NB_nldjs <- glm.nb(tlf ~ geno_sex  +poly(density,2) +geno_sex*poly(density,2) +poly(jday,2) +poly(jday,2)*geno_sex + poly(sex_ratio_rg_l,2) + poly(sex_ratio_rg_l,2)*geno_sex  , data = glm_2012_data)

M2012_NB_nlds <- glm.nb(tlf ~ geno_sex  +poly(density,2) +geno_sex*poly(density,2) +jday +jday*geno_sex + poly(sex_ratio_rg_l,2) + poly(sex_ratio_rg_l,2)*geno_sex  , data = glm_2012_data)

M2012_NB_nljs <- glm.nb(tlf ~ geno_sex  +density +geno_sex*density +poly(jday,2) +poly(jday,2)*geno_sex + poly(sex_ratio_rg_l,2) + poly(sex_ratio_rg_l,2)*geno_sex  , data = glm_2012_data)


AIC(M2012_NB, M2012_NB_nld, M2012_NB_nlj, M2012_NB_nldj, M2012_NB_nldjs, M2012_NB_nljs)
BIC(M2012_NB, M2012_NB_nld, M2012_NB_nlj, M2012_NB_nldj, M2012_NB_nldjs, M2012_NB_nljs)
#anova(M2012_NB, M2012_NB_nld, M2012_NB_nlj, test = "Chisq")

Fit all as linear.

Now let’s move on to model selection.

drop1(M2012_NB, test = "Chisq")

Interaction terms not significant,let’s drop least sig interaction and refit with main effects

M2012_NB2 <- glm.nb(tlf ~ geno_sex  +density +jday+ sex_ratio_rg_l +geno_sex*density + geno_sex*jday , data = glm_2012_data)
drop1(M2012_NB2, test = "Chisq")
M2012_NB3 <- glm.nb(tlf ~ geno_sex  +density +jday+ sex_ratio_rg_l +geno_sex*density  , data = glm_2012_data)
drop1(M2012_NB3, test = "Chisq")
M2012_NB4 <- glm.nb(tlf ~ geno_sex  +density +jday+ sex_ratio_rg_l , data = glm_2012_data)
drop1(M2012_NB4, test = "Chisq")

LRT suggests no significant effects. Density is almost significant.

Let’s look by Wald.

summary(M2012_NB)
## 
## Call:
## glm.nb(formula = tlf ~ geno_sex + density + geno_sex * density + 
##     jday + jday * geno_sex + sex_ratio_rg_l + sex_ratio_rg_l * 
##     geno_sex, data = glm_2012_data, init.theta = 0.3795880981, 
##     link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.2163  -1.0635  -0.8242   0.1721   2.0793  
## 
## Coefficients:
##                           Estimate Std. Error z value Pr(>|z|)
## (Intercept)               2.756033   1.948547   1.414    0.157
## geno_sexM                -4.071477   3.066174  -1.328    0.184
## density                   0.005871   0.012237   0.480    0.631
## jday                     -0.013743   0.008521  -1.613    0.107
## sex_ratio_rg_l            0.971034   0.683609   1.420    0.155
## geno_sexM:density         0.022867   0.017913   1.277    0.202
## geno_sexM:jday            0.016334   0.013534   1.207    0.227
## geno_sexM:sex_ratio_rg_l -0.807505   1.054335  -0.766    0.444
## 
## (Dispersion parameter for Negative Binomial(0.3796) family taken to be 1)
## 
##     Null deviance: 227.22  on 257  degrees of freedom
## Residual deviance: 213.09  on 250  degrees of freedom
## AIC: 745.57
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.3796 
##           Std. Err.:  0.0625 
## 
##  2 x log-likelihood:  -727.5740
summary(glm.nb(tlf ~ geno_sex +sex_ratio_rg_l +density +jday + jday*geno_sex + geno_sex*density , data = glm_2012_data))
## 
## Call:
## glm.nb(formula = tlf ~ geno_sex + sex_ratio_rg_l + density + 
##     jday + jday * geno_sex + geno_sex * density, data = glm_2012_data, 
##     init.theta = 0.3776691244, link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.2027  -1.0529  -0.8515   0.1818   2.0747  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)
## (Intercept)        2.122986   1.816618   1.169    0.243
## geno_sexM         -2.602686   2.573275  -1.011    0.312
## sex_ratio_rg_l     0.617989   0.523140   1.181    0.237
## density            0.009721   0.011430   0.850    0.395
## jday              -0.011625   0.008209  -1.416    0.157
## geno_sexM:jday     0.011063   0.012252   0.903    0.367
## geno_sexM:density  0.015362   0.015699   0.978    0.328
## 
## (Dispersion parameter for Negative Binomial(0.3777) family taken to be 1)
## 
##     Null deviance: 226.58  on 257  degrees of freedom
## Residual deviance: 213.12  on 251  degrees of freedom
## AIC: 744.19
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.3777 
##           Std. Err.:  0.0621 
## 
##  2 x log-likelihood:  -728.1900
summary(glm.nb(tlf ~ geno_sex +sex_ratio_rg_l +density +jday + geno_sex*density , data = glm_2012_data))
## 
## Call:
## glm.nb(formula = tlf ~ geno_sex + sex_ratio_rg_l + density + 
##     jday + geno_sex * density, data = glm_2012_data, init.theta = 0.3741159588, 
##     link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.1866  -1.0287  -0.8448   0.1703   1.8477  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)
## (Intercept)        1.308113   1.546161   0.846    0.398
## geno_sexM         -0.360362   0.600587  -0.600    0.548
## sex_ratio_rg_l     0.634959   0.524219   1.211    0.226
## density            0.010974   0.011339   0.968    0.333
## jday              -0.007510   0.006644  -1.130    0.258
## geno_sexM:density  0.011704   0.015028   0.779    0.436
## 
## (Dispersion parameter for Negative Binomial(0.3741) family taken to be 1)
## 
##     Null deviance: 225.38  on 257  degrees of freedom
## Residual deviance: 212.85  on 252  degrees of freedom
## AIC: 743.03
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.3741 
##           Std. Err.:  0.0613 
## 
##  2 x log-likelihood:  -729.0310
summary(glm.nb(tlf ~ geno_sex +sex_ratio_rg_l +density +jday , data = glm_2012_data))
## 
## Call:
## glm.nb(formula = tlf ~ geno_sex + sex_ratio_rg_l + density + 
##     jday, data = glm_2012_data, init.theta = 0.3714406409, link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.1513  -1.0163  -0.8576   0.1353   1.9958  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)  
## (Intercept)     1.040529   1.513580   0.687   0.4918  
## geno_sexM       0.066039   0.240934   0.274   0.7840  
## sex_ratio_rg_l  0.589420   0.520472   1.132   0.2574  
## density         0.016721   0.008957   1.867   0.0619 .
## jday           -0.007212   0.006650  -1.085   0.2781  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(0.3714) family taken to be 1)
## 
##     Null deviance: 224.47  on 257  degrees of freedom
## Residual deviance: 212.61  on 253  degrees of freedom
## AIC: 741.63
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.3714 
##           Std. Err.:  0.0607 
## 
##  2 x log-likelihood:  -729.6330
summary(glm.nb(tlf ~ sex_ratio_rg_l +density +jday , data = glm_2012_data))
## 
## Call:
## glm.nb(formula = tlf ~ sex_ratio_rg_l + density + jday, data = glm_2012_data, 
##     init.theta = 0.3711349901, link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.1432  -1.0252  -0.8652   0.1188   2.0328  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)  
## (Intercept)     1.026663   1.509511   0.680   0.4964  
## sex_ratio_rg_l  0.594252   0.517431   1.148   0.2508  
## density         0.017002   0.008959   1.898   0.0577 .
## jday           -0.007030   0.006652  -1.057   0.2906  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(0.3711) family taken to be 1)
## 
##     Null deviance: 224.37  on 257  degrees of freedom
## Residual deviance: 212.58  on 254  degrees of freedom
## AIC: 739.71
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.3711 
##           Std. Err.:  0.0606 
## 
##  2 x log-likelihood:  -729.7060
summary(glm.nb(tlf ~ sex_ratio_rg_l +density , data = glm_2012_data))
## 
## Call:
## glm.nb(formula = tlf ~ sex_ratio_rg_l + density, data = glm_2012_data, 
##     init.theta = 0.3675230203, link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.1618  -1.0253  -0.9168   0.1891   1.9239  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)   
## (Intercept)    -0.525575   0.372537  -1.411    0.158   
## sex_ratio_rg_l  0.327290   0.467445   0.700    0.484   
## density         0.021252   0.008014   2.652    0.008 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(0.3675) family taken to be 1)
## 
##     Null deviance: 223.13  on 257  degrees of freedom
## Residual deviance: 212.54  on 255  degrees of freedom
## AIC: 738.82
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.3675 
##           Std. Err.:  0.0599 
## 
##  2 x log-likelihood:  -730.8160
summary(glm.nb(tlf ~ density , data = glm_2012_data))
## 
## Call:
## glm.nb(formula = tlf ~ density, data = glm_2012_data, init.theta = 0.3653627056, 
##     link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.1650  -0.9861  -0.9344   0.1707   1.9870  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)   
## (Intercept) -0.694112   0.287224  -2.417  0.01567 * 
## density      0.023703   0.007287   3.253  0.00114 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(0.3654) family taken to be 1)
## 
##     Null deviance: 222.39  on 257  degrees of freedom
## Residual deviance: 212.39  on 256  degrees of freedom
## AIC: 737.36
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.3654 
##           Std. Err.:  0.0594 
## 
##  2 x log-likelihood:  -731.3600
M2012_NB_4 <- glm.nb(tlf ~ density , data = glm_2012_data)

Wald suggests only density

Now let’s examine the fit

plot(M2012_NB_4)

qqrplot(M2012_NB_4)

rootogram(M2012_NB_4)

simulateResiduals(M2012_NB_4, plot = TRUE)

## Object of Class DHARMa with simulated residuals based on 250 simulations with refit = FALSE . See ?DHARMa::simulateResiduals for help. 
##  
## Scaled residual values: 0.5843308 0.3359276 0.07005003 0.4083199 0.98 0.1524355 0.233027 0.5143451 0.6828106 0.8049384 0.6570003 0.9770405 0.4715661 0.2153433 0.4249999 0.6221605 0.3321563 0.2853203 0.301245 0.5103441 ...

The fit looks excellent!

One point of concern was that the single release at Breitenbush had a somewhat high density, so density and location effects could be confounded. The leverage plot above would pick up on any single observations were driving the fit, but since there are many observations from Breitenbush this wouldn’t show up.

Let’s summarise the model.

M2012_NB_null <- glm.nb(tlf ~ 1  , data = glm_2012_data)
anova(M2012_NB_null, M2012_NB_4)
summary(M2012_NB_4)
## 
## Call:
## glm.nb(formula = tlf ~ density, data = glm_2012_data, init.theta = 0.3653627056, 
##     link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.1650  -0.9861  -0.9344   0.1707   1.9870  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)   
## (Intercept) -0.694112   0.287224  -2.417  0.01567 * 
## density      0.023703   0.007287   3.253  0.00114 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(0.3654) family taken to be 1)
## 
##     Null deviance: 222.39  on 257  degrees of freedom
## Residual deviance: 212.39  on 256  degrees of freedom
## AIC: 737.36
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.3654 
##           Std. Err.:  0.0594 
## 
##  2 x log-likelihood:  -731.3600

For the results text, let’s predict TLF at minimum, median and maximum density, we’ll also make a nice plot

pe_d_2012 <- as.data.frame(predictorEffect("density", M2012_NB_4))
pe_d_2012 %>%
  filter(density %in% c(min(glm_2012_data$density), median(glm_2012_data$density), max(glm_2012_data$density)))
actual_means <- glm_2012_data %>%
  group_by(density) %>%
  summarise(mean_tlf = mean(tlf))

ggplot(data = pe_d_2012, aes(x = (density), y = fit))+ 
  geom_line() +
  xlab("Release Group Density") +
  geom_smooth( aes(ymin = lower, ymax = upper), stat = "identity") +
  theme_bw()+ylab("TLF")+coord_cartesian(ylim = c(0, 3)) +
  geom_rug(data = glm_2012_data, aes(x = density, y = NULL)) + 
  geom_point(data = actual_means, aes(x = density, y = mean_tlf), alpha = 0.8, shape = 24, size = 2)+ geom_point(aes(x = 46, y = 1.3913043), color = "red", shape  = 24, size = 2)+ggtitle("2012 Predictors of Fitness - Density")

#  annotate(geom="text", x=45, y=1, label="Single Day Release\nat Breitenbush")

Figure Caption: Release group density vs predicted (line and 95% confidence interval) and mean empirical TLF (triangle). Red triangle highlights the observations made from salmon released at Breitenbush River on a single day.

Summary The final model fit the better signficantly better than a null model including only an intercept (p= 0.002, LRT). After model selection by testing the impact of individual predictors on fit with LRT and backward stepwise selection using Wald tests, only a single predictor was included in the final model, density. Density had a positive effect on fitness (0.024 +- 0.007, log scale).

Recalling that location and density are somewhat confounded and we didn’t model location, we should also caution that this could be a location effect.

2013

glm_2013_data %>%
  group_by(jday, location) %>%
  summarise(density = n())
## `summarise()` has grouped output by 'jday'. You can override using the `.groups`
## argument.
d2013 <- glm_2013_data %>%
  group_by(jday, location) %>%
  summarise(density = n())
## `summarise()` has grouped output by 'jday'. You can override using the `.groups`
## argument.
glm_2013_data %<>%
  left_join(d2013)
## Joining, by = c("location", "jday")
f <- glm_2013_data  %>% 
  filter(geno_sex == "F") %>%
  group_by(location, jday) %>% 
  summarise(n_female_rg = n()) 
## `summarise()` has grouped output by 'location'. You can override using the
## `.groups` argument.
glm_2013_data  %<>%
  left_join(f)
## Joining, by = c("location", "jday")
m <- glm_2013_data  %>% 
  filter(geno_sex == "M") %>%
  group_by(location, jday) %>% 
  summarise(n_male_rg = n()) 
## `summarise()` has grouped output by 'location'. You can override using the
## `.groups` argument.
glm_2013_data  %<>%
  left_join(m) %>%
  mutate(sex_ratio_rg = n_male_rg/n_female_rg,
         sex_ratio_rg_l = log(sex_ratio_rg))
## Joining, by = c("location", "jday")
select(glm_2013_data, tlf, geno_sex, jday, location, density, sex_ratio_rg_l) %>%
  pairs(., lower.panel = panel.cor, diag.panel = panel.hist, upper.panel = panel.smooth2)

str(glm_2013_data)
## tibble [1,115 × 10] (S3: tbl_df/tbl/data.frame)
##  $ date          : Factor w/ 23 levels "2013-06-10","2013-06-13",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ geno_sex      : Factor w/ 2 levels "F","M": 1 1 1 1 1 1 1 1 1 1 ...
##  $ location      : Factor w/ 3 levels "hoover","kanes",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ tlf           : num [1:1115] 1 0 1 0 0 0 0 0 0 0 ...
##  $ jday          : num [1:1115] 161 161 161 161 161 161 161 161 161 161 ...
##  $ density       : int [1:1115] 48 48 48 48 48 48 48 48 48 48 ...
##  $ n_female_rg   : int [1:1115] 23 23 23 23 23 23 23 23 23 23 ...
##  $ n_male_rg     : int [1:1115] 25 25 25 25 25 25 25 25 25 25 ...
##  $ sex_ratio_rg  : num [1:1115] 1.09 1.09 1.09 1.09 1.09 ...
##  $ sex_ratio_rg_l: num [1:1115] 0.0834 0.0834 0.0834 0.0834 0.0834 ...

Once again there are systematic relationships between some the variables. Of the three locations, the third was only used at later dates. Only one should remain in the model. We’ll choose the most informative by model selection, but need to remember during interpretation that these variables are confounded.

M2013_NB <- glm.nb(tlf ~ geno_sex+location  +density  +jday + sex_ratio_rg_l, data = glm_2013_data)

summary(M2013_NB)  
## 
## Call:
## glm.nb(formula = tlf ~ geno_sex + location + density + jday + 
##     sex_ratio_rg_l, data = glm_2013_data, init.theta = 0.3108679912, 
##     link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.7934  -0.7387  -0.6897  -0.6203   2.7986  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)
## (Intercept)     -0.877105   0.876155  -1.001    0.317
## geno_sexM       -0.231900   0.148834  -1.558    0.119
## locationkanes   -0.170947   0.178558  -0.957    0.338
## locationmongold -0.042219   0.379249  -0.111    0.911
## density         -0.002749   0.003854  -0.713    0.476
## jday             0.001451   0.004566   0.318    0.751
## sex_ratio_rg_l  -0.168175   0.216866  -0.775    0.438
## 
## (Dispersion parameter for Negative Binomial(0.3109) family taken to be 1)
## 
##     Null deviance: 661.52  on 1114  degrees of freedom
## Residual deviance: 655.99  on 1108  degrees of freedom
## AIC: 1780.3
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.3109 
##           Std. Err.:  0.0398 
## 
##  2 x log-likelihood:  -1764.2870
vif(M2013_NB)
##                    GVIF Df GVIF^(1/(2*Df))
## geno_sex       1.050449  1        1.024914
## location       5.135619  2        1.505387
## density        1.617153  1        1.271673
## jday           4.370027  1        2.090461
## sex_ratio_rg_l 1.902507  1        1.379314

First, let’s get those VIFs down to something more acceptable. We’ll use AIC and LRTs to determine which variable is better to include.

M2013_NB2 <- glm.nb(tlf ~ geno_sex  +density +jday +sex_ratio_rg_l  , data = glm_2013_data)
M2013_NB3 <- glm.nb(tlf ~ geno_sex+location  +density  +sex_ratio_rg_l  , data = glm_2013_data)
AIC(M2013_NB2, M2013_NB3)
vif(M2013_NB2)
##       geno_sex        density           jday sex_ratio_rg_l 
##       1.050918       1.605860       1.273330       1.358543
vif(M2013_NB3)
##                    GVIF Df GVIF^(1/(2*Df))
## geno_sex       1.050272  1        1.024828
## location       1.483224  2        1.103575
## density        1.533484  1        1.238339
## sex_ratio_rg_l 1.480859  1        1.216905
lrtest(M2013_NB2, M2013_NB3)

The models without location and without date were equivalently informative, and neither demonstrated further collinearity problems.

Let’s fit with jday instead of location, for consistency, but note that this is somewhat arbitrary and that any effect of location that we find needs to be considered in light of the relationship between these variables.

Now let’s check to see if need to fit a non-linear effect of density before finalizing model selection.

M2013_NB4 <- glm.nb(tlf ~ geno_sex+jday  +poly(density,2) +geno_sex*poly(density,2)+sex_ratio_rg_l+sex_ratio_rg_l*geno_sex +geno_sex*jday  , data = glm_2013_data)

M2013_NB5 <- glm.nb(tlf ~ geno_sex+ density + jday +geno_sex*density +sex_ratio_rg_l+sex_ratio_rg_l*geno_sex +geno_sex*jday , data = glm_2013_data)

M2013_NB6 <- glm.nb(tlf ~ geno_sex+density +poly(jday,2) +geno_sex*density +sex_ratio_rg_l+sex_ratio_rg_l*geno_sex  +geno_sex*poly(jday,2) , data = glm_2013_data)

M2013_NB7 <- glm.nb(tlf ~ geno_sex+poly(density,2) +poly(jday,2) +geno_sex*poly(density,2) +sex_ratio_rg_l+sex_ratio_rg_l*geno_sex  +geno_sex*poly(jday,2) , data = glm_2013_data)

M2013_NB8 <- glm.nb(tlf ~ geno_sex+ density + jday +geno_sex*density +poly(sex_ratio_rg_l,2)+poly(sex_ratio_rg_l,2)*geno_sex +geno_sex*jday , data = glm_2013_data)

M2013_NB9 <- glm.nb(tlf ~ geno_sex+ poly(density,2) + jday +geno_sex*poly(density,2) +poly(sex_ratio_rg_l,2)+poly(sex_ratio_rg_l,2)*geno_sex +geno_sex*jday , data = glm_2013_data)

M2013_NB10 <- glm.nb(tlf ~ geno_sex+ density + poly(jday,2) +geno_sex*density +poly(sex_ratio_rg_l,2)+poly(sex_ratio_rg_l,2)*geno_sex +geno_sex*poly(jday,2) , data = glm_2013_data)

M2013_NB11 <- glm.nb(tlf ~ geno_sex+ poly(density,2) + poly(jday,2) +geno_sex*poly(density,2) +poly(sex_ratio_rg_l,2)+poly(sex_ratio_rg_l,2)*geno_sex +geno_sex*poly(jday,2) , data = glm_2013_data)

AIC( M2013_NB4, M2013_NB5, M2013_NB6,  M2013_NB7, M2013_NB8, M2013_NB9, M2013_NB10, M2013_NB11)
anova(M2013_NB4, M2013_NB5)

Including density as non-linear is a very marginally significant improvement (delta AIC = 2.3, LRT p-value = 0.03) Let’s fit it.

First let’s use the drop1 command (LRTs for each effect)

drop1(M2013_NB4, test = "Chisq")

Let’s refit without the interaction.

drop1((glm.nb(tlf ~ geno_sex+jday  +poly(density,2) +geno_sex*poly(density,2) + geno_sex*jday +sex_ratio_rg_l  , data = glm_2013_data)), test = "Chisq")
drop1((glm.nb(tlf ~ geno_sex+jday  +poly(density,2)  + geno_sex*jday +sex_ratio_rg_l  , data = glm_2013_data)), test = "Chisq")
drop1((glm.nb(tlf ~ geno_sex+jday  +poly(density,2)  +sex_ratio_rg  , data = glm_2013_data)), test = "Chisq")

Significant effects of release group sex ratio, density and day.

Now backward stepwise by Wald

summary(M2013_NB4)
## 
## Call:
## glm.nb(formula = tlf ~ geno_sex + jday + poly(density, 2) + geno_sex * 
##     poly(density, 2) + sex_ratio_rg_l + sex_ratio_rg_l * geno_sex + 
##     geno_sex * jday, data = glm_2013_data, init.theta = 0.3202710124, 
##     link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.8260  -0.7410  -0.6884  -0.5995   3.0488  
## 
## Coefficients:
##                              Estimate Std. Error z value Pr(>|z|)  
## (Intercept)                 -1.821604   1.011845  -1.800   0.0718 .
## geno_sexM                   -0.996656   1.361827  -0.732   0.4643  
## jday                         0.005336   0.004926   1.083   0.2786  
## poly(density, 2)1           -6.289136   5.238265  -1.201   0.2299  
## poly(density, 2)2           -7.268503   5.484617  -1.325   0.1851  
## sex_ratio_rg_l              -0.517777   0.375113  -1.380   0.1675  
## geno_sexM:poly(density, 2)1 -3.032156   7.629774  -0.397   0.6911  
## geno_sexM:poly(density, 2)2 -5.537307   7.776647  -0.712   0.4764  
## geno_sexM:sex_ratio_rg_l    -0.102561   0.491693  -0.209   0.8348  
## geno_sexM:jday               0.003619   0.006581   0.550   0.5824  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(0.3203) family taken to be 1)
## 
##     Null deviance: 670.16  on 1114  degrees of freedom
## Residual deviance: 657.86  on 1105  degrees of freedom
## AIC: 1779.7
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.3203 
##           Std. Err.:  0.0414 
## 
##  2 x log-likelihood:  -1757.6540
summary(glm.nb(tlf ~ geno_sex+jday  +poly(density,2) +poly(density,2)*geno_sex +sex_ratio_rg_l + geno_sex*sex_ratio_rg_l , data = glm_2013_data))
## 
## Call:
## glm.nb(formula = tlf ~ geno_sex + jday + poly(density, 2) + poly(density, 
##     2) * geno_sex + sex_ratio_rg_l + geno_sex * sex_ratio_rg_l, 
##     data = glm_2013_data, init.theta = 0.319697262, link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.8460  -0.7347  -0.6878  -0.6058   2.9734  
## 
## Coefficients:
##                              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                 -2.245332   0.678894  -3.307 0.000942 ***
## geno_sexM                   -0.252726   0.188208  -1.343 0.179336    
## jday                         0.007415   0.003266   2.270 0.023206 *  
## poly(density, 2)1           -7.748730   4.489248  -1.726 0.084336 .  
## poly(density, 2)2           -8.891746   4.733349  -1.879 0.060308 .  
## sex_ratio_rg_l              -0.594477   0.349469  -1.701 0.088927 .  
## geno_sexM:poly(density, 2)1 -0.425359   5.820058  -0.073 0.941738    
## geno_sexM:poly(density, 2)2 -2.726011   5.836473  -0.467 0.640454    
## geno_sexM:sex_ratio_rg_l     0.018339   0.432917   0.042 0.966210    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(0.3197) family taken to be 1)
## 
##     Null deviance: 669.64  on 1114  degrees of freedom
## Residual deviance: 657.65  on 1106  degrees of freedom
## AIC: 1778
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.3197 
##           Std. Err.:  0.0413 
## 
##  2 x log-likelihood:  -1757.9570
summary(glm.nb(tlf ~ geno_sex+jday  +poly(density,2) +poly(density,2)*geno_sex +sex_ratio_rg_l  , data = glm_2013_data))
## 
## Call:
## glm.nb(formula = tlf ~ geno_sex + jday + poly(density, 2) + poly(density, 
##     2) * geno_sex + sex_ratio_rg_l, data = glm_2013_data, init.theta = 0.3196919213, 
##     link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.8458  -0.7343  -0.6880  -0.6058   2.9717  
## 
## Coefficients:
##                              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                 -2.245900   0.678519  -3.310 0.000933 ***
## geno_sexM                   -0.247799   0.148563  -1.668 0.095320 .  
## jday                         0.007407   0.003263   2.270 0.023205 *  
## poly(density, 2)1           -7.683663   4.216937  -1.822 0.068440 .  
## poly(density, 2)2           -8.814377   4.375359  -2.015 0.043952 *  
## sex_ratio_rg_l              -0.583994   0.242466  -2.409 0.016016 *  
## geno_sexM:poly(density, 2)1 -0.549675   5.048844  -0.109 0.913304    
## geno_sexM:poly(density, 2)2 -2.852223   5.036852  -0.566 0.571210    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(0.3197) family taken to be 1)
## 
##     Null deviance: 669.63  on 1114  degrees of freedom
## Residual deviance: 657.65  on 1107  degrees of freedom
## AIC: 1776
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.3197 
##           Std. Err.:  0.0413 
## 
##  2 x log-likelihood:  -1757.9590
summary(glm.nb(tlf ~ geno_sex+jday  +poly(density,2)  +sex_ratio_rg_l  , data = glm_2013_data))
## 
## Call:
## glm.nb(formula = tlf ~ geno_sex + jday + poly(density, 2) + sex_ratio_rg_l, 
##     data = glm_2013_data, init.theta = 0.319215236, link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.8394  -0.7406  -0.6829  -0.6106   2.9692  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       -2.228028   0.675977  -3.296 0.000981 ***
## geno_sexM         -0.243920   0.148067  -1.647 0.099485 .  
## jday               0.007337   0.003252   2.256 0.024064 *  
## poly(density, 2)1 -7.667283   3.719147  -2.062 0.039249 *  
## poly(density, 2)2 -9.948491   3.777787  -2.633 0.008453 ** 
## sex_ratio_rg_l    -0.581070   0.241247  -2.409 0.016013 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(0.3192) family taken to be 1)
## 
##     Null deviance: 669.20  on 1114  degrees of freedom
## Residual deviance: 657.55  on 1109  degrees of freedom
## AIC: 1772.3
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.3192 
##           Std. Err.:  0.0412 
## 
##  2 x log-likelihood:  -1758.2880
summary(glm.nb(tlf ~ jday  +poly(density,2)  +sex_ratio_rg_l  , data = glm_2013_data))
## 
## Call:
## glm.nb(formula = tlf ~ jday + poly(density, 2) + sex_ratio_rg_l, 
##     data = glm_2013_data, init.theta = 0.3155321175, link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.8047  -0.7300  -0.6984  -0.6030   2.8181  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       -2.306088   0.673706  -3.423 0.000619 ***
## jday               0.007163   0.003258   2.199 0.027897 *  
## poly(density, 2)1 -7.575123   3.723968  -2.034 0.041936 *  
## poly(density, 2)2 -9.763415   3.780813  -2.582 0.009813 ** 
## sex_ratio_rg_l    -0.637605   0.239173  -2.666 0.007679 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(0.3155) family taken to be 1)
## 
##     Null deviance: 665.83  on 1114  degrees of freedom
## Residual deviance: 656.94  on 1110  degrees of freedom
## AIC: 1773
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.3155 
##           Std. Err.:  0.0406 
## 
##  2 x log-likelihood:  -1760.9760

Same result

glm2013_final <- (glm.nb(tlf ~ jday  +poly(density,2)  +sex_ratio_rg_l  , data = glm_2013_data))

Summary

In 2013 there was a significant effect of release day, release group sex ratio and density. Let’s plot the effects.

require(effects)

#let's not use the effect package plotting tools and just get the effects

#eff1 <- predictorEffect("sex_ratio_rg_l", final_model, focal.levels = seq(-0.4,0.7,by = 0.1))
eff_sr <- predictorEffect("sex_ratio_rg_l", glm2013_final)
eff_sr_df <- as.data.frame(eff_sr)

eff_sr_df %<>%
  mutate(sex_ratio = exp(sex_ratio_rg_l))

actual_means <- glm_2013_data %>%
  group_by( sex_ratio_rg) %>%
  summarise(mean_tlf = mean(tlf))

ggplot(data = eff_sr_df, aes(x = (sex_ratio), y = fit))+ 
  geom_line()+scale_x_continuous(trans = "log", breaks = c(0.5, 0.75, 1, 1.5, 2, 2.5, 3)) +
  xlab( bquote(atop("Sex Ratio",(N[male]/N[female])))) +
  geom_smooth( aes(ymin = lower, ymax = upper), stat = "identity") +
  theme_bw()+ylab("TLF") +
  geom_rug(data = glm_2013_data, aes(x = sex_ratio_rg, y = NULL, color = NULL)) + 
  geom_point(data = actual_means, aes(x = sex_ratio_rg, y = mean_tlf), alpha = 0.8, shape = 24, size = 2)+ggtitle("2013 Predictors of Fitness - Release Group Sex Ratio")+ylim(0, 1.5)

#density
eff_d <- predictorEffect("density", glm2013_final)
eff_d_df <- as.data.frame(eff_d)


actual_means <- glm_2013_data %>%
  group_by( density) %>%
  summarise(mean_tlf = mean(tlf))

ggplot(data = eff_d_df, aes(x = (density), y = fit))+ 
  geom_line() +
  xlab("Release Group Density") +
  geom_smooth( aes(ymin = lower, ymax = upper), stat = "identity") +
  theme_bw()+ylab("TLF") +
  geom_point(data = actual_means, aes(x = density, y = mean_tlf), alpha = 0.8, shape = 24, size = 2) +
  geom_rug(data = glm_2013_data, aes(x = density, y = NULL, color = NULL)) +
  ggtitle("2013 Predictors of Fitness - Density")+ylim(0, 1.5)

#day
eff_j <- predictorEffect("jday", glm2013_final)
eff_j_df <- as.data.frame(eff_j)


actual_means <- glm_2013_data %>%
  group_by( jday) %>%
  summarise(mean_tlf = mean(tlf))

ggplot(data = eff_j_df, aes(x = (jday), y = fit))+ 
  geom_line() +
  xlab("Julian Day of Release") +
  geom_smooth( aes(ymin = lower, ymax = upper), stat = "identity") +
  theme_bw()+ylab("TLF") +
  geom_point(data = actual_means, aes(x = jday, y = mean_tlf), alpha = 0.8, shape = 24, size = 2) + 
  geom_rug(data = glm_2013_data, aes(x = jday, y = NULL, color = NULL)) + 
  ggtitle("2013 Predictors of Fitness - Release Day")+ylim(0, 1.5)

2014

glm_2014_data %>%
  group_by(jday, location) %>%
  summarise(density = n())
## `summarise()` has grouped output by 'jday'. You can override using the `.groups`
## argument.
d2014 <- glm_2014_data %>%
  group_by(jday, location) %>%
  summarise(density = n())
## `summarise()` has grouped output by 'jday'. You can override using the `.groups`
## argument.
glm_2014_data %<>%
  left_join(d2014)
## Joining, by = c("location", "jday", "density")
f <- glm_2014_data  %>% 
  filter(geno_sex == "F") %>%
  group_by(location, jday) %>% 
  summarise(n_female_rg = n()) 
## `summarise()` has grouped output by 'location'. You can override using the
## `.groups` argument.
glm_2014_data  %<>%
  left_join(f)
## Joining, by = c("location", "jday")
m <- glm_2014_data  %>% 
  filter(geno_sex == "M") %>%
  group_by(location, jday) %>% 
  summarise(n_male_rg = n()) 
## `summarise()` has grouped output by 'location'. You can override using the
## `.groups` argument.
glm_2014_data  %<>%
  left_join(m) %>%
  mutate(sex_ratio_rg = n_male_rg/n_female_rg,
         sex_ratio_rg_l = log(sex_ratio_rg))
## Joining, by = c("location", "jday")
select(glm_2014_data, tlf, geno_sex, jday, location, density, sex_ratio_rg_l) %>%
  pairs(., lower.panel = panel.cor, diag.panel = panel.hist, upper.panel = panel.smooth2)

str(glm_2014_data)
## tibble [861 × 10] (S3: tbl_df/tbl/data.frame)
##  $ date          : Factor w/ 36 levels "2014-06-17","2014-06-19",..: 2 2 1 1 1 1 1 1 1 1 ...
##  $ geno_sex      : Factor w/ 2 levels "F","M": 1 1 1 1 1 1 1 1 1 1 ...
##  $ location      : Factor w/ 5 levels "breitenbush river",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ tlf           : num [1:861] 0 0 0 0 0 0 0 0 0 0 ...
##  $ jday          : num [1:861] 170 170 168 168 168 168 168 168 168 168 ...
##  $ density       : int [1:861] 4 4 68 68 68 68 68 68 68 68 ...
##  $ n_female_rg   : int [1:861] 2 2 31 31 31 31 31 31 31 31 ...
##  $ n_male_rg     : int [1:861] 2 2 37 37 37 37 37 37 37 37 ...
##  $ sex_ratio_rg  : num [1:861] 1 1 1.19 1.19 1.19 ...
##  $ sex_ratio_rg_l: num [1:861] 0 0 0.177 0.177 0.177 ...

Once again location and jday are confounded. There is a location that is used early in the outplanting schedule and then never again. We’ll take the same approach as in 2013, and exclude location, but will need to remember this when interpreting effects.

M2014_NB <- glm.nb(tlf ~ geno_sex+location  +density  +jday+ sex_ratio_rg_l , data = glm_2014_data)

summary(M2014_NB)  
## 
## Call:
## glm.nb(formula = tlf ~ geno_sex + location + density + jday + 
##     sex_ratio_rg_l, data = glm_2014_data, init.theta = 0.2259743693, 
##     link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.6113  -0.5613  -0.4712  -0.4556   2.9582  
## 
## Coefficients:
##                        Estimate Std. Error z value Pr(>|z|)  
## (Intercept)           -0.796258   2.404611  -0.331   0.7405  
## geno_sexM             -0.551889   0.230860  -2.391   0.0168 *
## locationcoopers ridge -0.406395   1.037283  -0.392   0.6952  
## locationhoover        -0.282637   0.608213  -0.465   0.6421  
## locationhorn creek     0.018029   0.481641   0.037   0.9701  
## locationkanes         -0.243119   0.627392  -0.388   0.6984  
## density                0.003906   0.008124   0.481   0.6306  
## jday                  -0.002666   0.009878  -0.270   0.7873  
## sex_ratio_rg_l        -0.033615   0.185606  -0.181   0.8563  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(0.226) family taken to be 1)
## 
##     Null deviance: 341.37  on 860  degrees of freedom
## Residual deviance: 334.67  on 852  degrees of freedom
## AIC: 831.12
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.2260 
##           Std. Err.:  0.0486 
## 
##  2 x log-likelihood:  -811.1220
vif(M2014_NB)
##                    GVIF Df GVIF^(1/(2*Df))
## geno_sex       1.075886  1        1.037249
## location       9.939363  4        1.332508
## density        1.879917  1        1.371100
## jday           6.627468  1        2.574387
## sex_ratio_rg_l 1.212764  1        1.101255
M2014_NBb <- glm.nb(tlf ~ geno_sex +density  +jday+ sex_ratio_rg_l , data = glm_2014_data)

summary(M2014_NBb)  
## 
## Call:
## glm.nb(formula = tlf ~ geno_sex + density + jday + sex_ratio_rg_l, 
##     data = glm_2014_data, init.theta = 0.2254567095, link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.5919  -0.5609  -0.4714  -0.4592   2.9029  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)  
## (Intercept)    -1.791521   1.010299  -1.773   0.0762 .
## geno_sexM      -0.546343   0.230953  -2.366   0.0180 *
## density         0.002526   0.006463   0.391   0.6959  
## jday            0.001424   0.004239   0.336   0.7370  
## sex_ratio_rg_l -0.027035   0.181372  -0.149   0.8815  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(0.2255) family taken to be 1)
## 
##     Null deviance: 341.06  on 860  degrees of freedom
## Residual deviance: 334.75  on 856  degrees of freedom
## AIC: 823.5
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.2255 
##           Std. Err.:  0.0485 
## 
##  2 x log-likelihood:  -811.5040
vif(M2014_NBb)
##       geno_sex        density           jday sex_ratio_rg_l 
##       1.076133       1.187413       1.225121       1.167325
M2014_NB2 <- glm.nb(tlf ~ geno_sex  +density  +jday + geno_sex*density + geno_sex*jday +sex_ratio_rg_l +sex_ratio_rg_l*geno_sex, data = glm_2014_data)
M2014_NB3 <- glm.nb(tlf ~ geno_sex+location  +density + geno_sex*density  +sex_ratio_rg_l +sex_ratio_rg_l*geno_sex , data = glm_2014_data)
AIC(M2014_NB2, M2014_NB3)
anova(M2014_NB2, M2014_NB3)

In this case, dropping location allows for a model that provides a better fit to the data.

Now let’s check to see if need to fit a non-linear effect of density, jday or sex ratio before finalizing the start of model selection.

M2014_NB4 <- glm.nb(tlf ~ geno_sex  +density  +poly(jday,2) + geno_sex*density + geno_sex*poly(jday,2)+sex_ratio_rg_l +sex_ratio_rg_l*geno_sex, data = glm_2014_data)
M2014_NB5 <- glm.nb(tlf ~ geno_sex  +poly(density,2)  +jday + geno_sex*poly(density,2) + geno_sex*jday+sex_ratio_rg_l +sex_ratio_rg_l*geno_sex, data = glm_2014_data)
M2014_NB6 <- glm.nb(tlf ~ geno_sex  +poly(density,2)  +poly(jday,2) + geno_sex*poly(density,2) + geno_sex*poly(jday,2)+sex_ratio_rg_l +sex_ratio_rg_l*geno_sex, data = glm_2014_data)
M2014_NB7 <- glm.nb(tlf ~ geno_sex  +poly(density,2)  +poly(jday,2) + geno_sex*poly(density,2) + geno_sex*poly(jday,2)+poly(sex_ratio_rg_l,2) +poly(sex_ratio_rg_l,2)*geno_sex, data = glm_2014_data)
M2014_NB8 <- glm.nb(tlf ~ geno_sex  +density  +poly(jday,2) + geno_sex*density + geno_sex*poly(jday,2)+poly(sex_ratio_rg_l,2) +poly(sex_ratio_rg_l,2)*geno_sex, data = glm_2014_data)
M2014_NB9 <- glm.nb(tlf ~ geno_sex  +poly(density,2)  +jday + geno_sex*poly(density,2) + geno_sex*jday+poly(sex_ratio_rg_l,2) +poly(sex_ratio_rg_l,2)*geno_sex, data = glm_2014_data)
M2014_NB10 <- glm.nb(tlf ~ geno_sex  +density  +jday + geno_sex*density + geno_sex*jday+poly(sex_ratio_rg_l,2) +poly(sex_ratio_rg_l,2)*geno_sex, data = glm_2014_data)


AIC(M2014_NB2,M2014_NB4, M2014_NB5, M2014_NB6, M2014_NB7, M2014_NB8, M2014_NB9, M2014_NB10)
BIC(M2014_NB2,M2014_NB4, M2014_NB5, M2014_NB6, M2014_NB7, M2014_NB8, M2014_NB9, M2014_NB10)
anova(M2014_NB2, M2014_NB4, M2014_NB5, M2014_NB6)
anova(M2014_NB2, M2014_NB5)

Fitting density as a quadratic only provides a marginal improvement to the data, all others make it worse. Will keep linear.

Let’s do model selection.

drop1(M2014_NB2, test ="Chisq")

Let’s refit without the insignficant interaction.

M2014_NB4 <- glm.nb(tlf ~ geno_sex  +density  +jday + sex_ratio_rg_l+ geno_sex*density + geno_sex*sex_ratio_rg_l , data = glm_2014_data)
drop1(M2014_NB4, test = "Chisq")
M2014_NB4 <- glm.nb(tlf ~ geno_sex  +density  +jday + sex_ratio_rg_l+ geno_sex*density  , data = glm_2014_data)
drop1(M2014_NB4, test = "Chisq")

LRT suggests a final model with sex, density and their interaction.

summary(glm.nb(tlf ~ geno_sex  +density  +jday + geno_sex*density + jday*geno_sex +sex_ratio_rg_l + sex_ratio_rg_l*geno_sex  , data = glm_2014_data))
## 
## Call:
## glm.nb(formula = tlf ~ geno_sex + density + jday + geno_sex * 
##     density + jday * geno_sex + sex_ratio_rg_l + sex_ratio_rg_l * 
##     geno_sex, data = glm_2014_data, init.theta = 0.2598100312, 
##     link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.7784  -0.5470  -0.4694  -0.3794   2.7763  
## 
## Coefficients:
##                           Estimate Std. Error z value Pr(>|z|)   
## (Intercept)              -1.581577   1.446444  -1.093  0.27421   
## geno_sexM                -0.735068   2.031510  -0.362  0.71748   
## density                  -0.015999   0.009716  -1.647  0.09963 . 
## jday                      0.003627   0.006002   0.604  0.54558   
## sex_ratio_rg_l           -0.152869   0.265949  -0.575  0.56542   
## geno_sexM:density         0.040343   0.013517   2.985  0.00284 **
## geno_sexM:jday           -0.008586   0.008769  -0.979  0.32751   
## geno_sexM:sex_ratio_rg_l  0.693961   0.387086   1.793  0.07301 . 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(0.2598) family taken to be 1)
## 
##     Null deviance: 360.13  on 860  degrees of freedom
## Residual deviance: 338.91  on 853  degrees of freedom
## AIC: 815.4
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.2598 
##           Std. Err.:  0.0580 
## 
##  2 x log-likelihood:  -797.3950
summary(glm.nb(tlf ~ geno_sex  +density  +jday + geno_sex*density  +sex_ratio_rg_l + sex_ratio_rg_l*geno_sex  , data = glm_2014_data))
## 
## Call:
## glm.nb(formula = tlf ~ geno_sex + density + jday + geno_sex * 
##     density + sex_ratio_rg_l + sex_ratio_rg_l * geno_sex, data = glm_2014_data, 
##     init.theta = 0.2573021845, link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.7460  -0.5640  -0.4711  -0.4009   2.8008  
## 
## Coefficients:
##                            Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              -0.6959730  1.0667501  -0.652 0.514128    
## geno_sexM                -2.6238904  0.6531312  -4.017 5.88e-05 ***
## density                  -0.0179310  0.0093435  -1.919 0.054973 .  
## jday                     -0.0001898  0.0043394  -0.044 0.965111    
## sex_ratio_rg_l           -0.1590076  0.2701357  -0.589 0.556115    
## geno_sexM:density         0.0439916  0.0129654   3.393 0.000691 ***
## geno_sexM:sex_ratio_rg_l  0.6107508  0.3793111   1.610 0.107363    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(0.2573) family taken to be 1)
## 
##     Null deviance: 358.83  on 860  degrees of freedom
## Residual deviance: 338.59  on 854  degrees of freedom
## AIC: 814.31
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.2573 
##           Std. Err.:  0.0573 
## 
##  2 x log-likelihood:  -798.3060
summary(glm.nb(tlf ~ geno_sex  +density  +jday + geno_sex*density  +sex_ratio_rg_l   , data = glm_2014_data))
## 
## Call:
## glm.nb(formula = tlf ~ geno_sex + density + jday + geno_sex * 
##     density + sex_ratio_rg_l, data = glm_2014_data, init.theta = 0.2517890275, 
##     link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.7498  -0.5424  -0.4631  -0.4145   3.1216  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       -1.182363   1.022897  -1.156  0.24772    
## geno_sexM         -2.057935   0.526604  -3.908 9.31e-05 ***
## density           -0.018239   0.009472  -1.926  0.05416 .  
## jday               0.001424   0.004220   0.338  0.73571    
## sex_ratio_rg_l     0.166024   0.188587   0.880  0.37867    
## geno_sexM:density  0.039895   0.012586   3.170  0.00153 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(0.2518) family taken to be 1)
## 
##     Null deviance: 355.91  on 860  degrees of freedom
## Residual deviance: 338.37  on 855  degrees of freedom
## AIC: 814.83
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.2518 
##           Std. Err.:  0.0558 
## 
##  2 x log-likelihood:  -800.8290
summary(glm.nb(tlf ~ geno_sex  +density + geno_sex*density  +sex_ratio_rg_l   , data = glm_2014_data))
## 
## Call:
## glm.nb(formula = tlf ~ geno_sex + density + geno_sex * density + 
##     sex_ratio_rg_l, data = glm_2014_data, init.theta = 0.2514071821, 
##     link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.7503  -0.5423  -0.4618  -0.4160   3.1272  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       -0.856083   0.361450  -2.368  0.01786 *  
## geno_sexM         -2.059606   0.524901  -3.924 8.72e-05 ***
## density           -0.019025   0.009168  -2.075  0.03796 *  
## sex_ratio_rg_l     0.178565   0.185424   0.963  0.33554    
## geno_sexM:density  0.039866   0.012540   3.179  0.00148 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(0.2514) family taken to be 1)
## 
##     Null deviance: 355.71  on 860  degrees of freedom
## Residual deviance: 338.29  on 856  degrees of freedom
## AIC: 812.94
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.2514 
##           Std. Err.:  0.0557 
## 
##  2 x log-likelihood:  -800.9390
summary(glm.nb(tlf ~ geno_sex  +density + geno_sex*density     , data = glm_2014_data))
## 
## Call:
## glm.nb(formula = tlf ~ geno_sex + density + geno_sex * density, 
##     data = glm_2014_data, init.theta = 0.2516354857, link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.7064  -0.5437  -0.4681  -0.4041   3.3700  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       -0.796869   0.350443  -2.274 0.022972 *  
## geno_sexM         -1.842167   0.484981  -3.798 0.000146 ***
## density           -0.018344   0.009078  -2.021 0.043309 *  
## geno_sexM:density  0.036000   0.011962   3.009 0.002617 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(0.2516) family taken to be 1)
## 
##     Null deviance: 355.83  on 860  degrees of freedom
## Residual deviance: 339.33  on 857  degrees of freedom
## AIC: 811.87
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.2516 
##           Std. Err.:  0.0560 
## 
##  2 x log-likelihood:  -801.8670

Backward stepwise by Wald agrees.

Now let’s examine the final model fit.

M2014_NB5 <- glm.nb(tlf ~ geno_sex  +density + geno_sex*density , data = glm_2014_data)
plot(M2014_NB5)

qqrplot(M2014_NB5)

rootogram(M2014_NB5)

simulateResiduals(M2014_NB5, plot = TRUE)
## DHARMa:testOutliers with type = binomial may have inflated Type I error rates for integer-valued distributions. To get a more exact result, it is recommended to re-run testOutliers with type = 'bootstrap'. See ?testOutliers for details

## Object of Class DHARMa with simulated residuals based on 250 simulations with refit = FALSE . See ?DHARMa::simulateResiduals for help. 
##  
## Scaled residual values: 0.5806527 0.7010329 0.7820945 0.8117405 0.640081 0.6660345 0.9188419 0.6013782 0.05713582 0.8355606 0.1391672 0.05938788 0.8368955 0.189083 0.5718918 0.1887942 0.8808443 0.8742652 0.6444631 0.07805731 ...

A much better looking fit. An outlier issue shows up with th e simulated residuals, but it is only marginally significant and difficult to see any effects of outliers in the leverage plot or qqplots.

Let’s summarise the final model and discuss the results:

summary(M2014_NB5)
## 
## Call:
## glm.nb(formula = tlf ~ geno_sex + density + geno_sex * density, 
##     data = glm_2014_data, init.theta = 0.2516354857, link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.7064  -0.5437  -0.4681  -0.4041   3.3700  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       -0.796869   0.350443  -2.274 0.022972 *  
## geno_sexM         -1.842167   0.484981  -3.798 0.000146 ***
## density           -0.018344   0.009078  -2.021 0.043309 *  
## geno_sexM:density  0.036000   0.011962   3.009 0.002617 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(0.2516) family taken to be 1)
## 
##     Null deviance: 355.83  on 860  degrees of freedom
## Residual deviance: 339.33  on 857  degrees of freedom
## AIC: 811.87
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.2516 
##           Std. Err.:  0.0560 
## 
##  2 x log-likelihood:  -801.8670

Summary / discussion

The final model includes a significant effect of sex, density and their interaction. Males have lower fitness than females overall. Density has a negative effect on fitness for females and a positive effect on males. The overall sex ratio is 292:569, female:male. I plot the estimated effects below (note: should spend more time learning the emmeans package before publishing the figures below!).

Some things jump out here to discuss later: density effects estimated form individual outplantings suggest that fish s

 #emmeans::emmip(M2014_NB5,  geno_sex ~ density, cov.reduce = range, type = "scale", CIs = TRUE)+ylab("predicted response in reponse scale \n (tlf, not log(tlf)")+theme_classic()
#ggplot(glm_2014_data)+geom_histogram(aes(x = density))+theme_classic()


#let's not use the effect package plotting tools and just get the effects

#eff1 <- predictorEffect("sex_ratio_rg_l", final_model, focal.levels = seq(-0.4,0.7,by = 0.1))
eff1 <- predictorEffect("density", M2014_NB5)
effdf <- as.data.frame(eff1)


actual_means <- glm_2014_data %>%
  group_by(density, geno_sex) %>%
  summarise(mean_tlf = mean(tlf))
## `summarise()` has grouped output by 'density'. You can override using the
## `.groups` argument.
ggplot(data = effdf, aes(x = (density), y = fit, color = geno_sex))+ 
  geom_line() +
  xlab("Density") +
  geom_smooth( aes(ymin = lower, ymax = upper, fill = geno_sex, colour = geno_sex), stat = "identity") +
  theme_bw()+ylab("TLF")+coord_cartesian(ylim = c(0, 0.9)) + scale_color_manual(labels = c("Female", "Male"), name = "Sex", values = c("#228833", "#AA3377")) + scale_fill_manual(labels = c("Female", "Male"), name = "Sex", values = c("#228833", "#AA3377")) +
  geom_rug(data = glm_2014_data, aes(x = density, y = NULL, color = NULL))+ggtitle("2014 Predictors of Fitness - Sex * Density Interaction")

ggplot(data = effdf, aes(x = (density), y = fit, color = geno_sex))+ 
  geom_line() +
  xlab("Density") +
  geom_smooth( aes(ymin = lower, ymax = upper, fill = geno_sex, colour = geno_sex), stat = "identity") +
  theme_bw()+ylab("TLF") + scale_color_manual(labels = c("Female", "Male"), name = "Sex", values = c("#228833", "#AA3377")) + scale_fill_manual(labels = c("Female", "Male"), name = "Sex", values = c("#228833", "#AA3377")) +
  #geom_rug(data = glm_2014_data, aes(x = density, y = NULL, color = NULL))  +
  geom_jitter(data = glm_2014_data, aes(x = density, y = tlf, color = geno_sex, alpha = 0.5))

2015

glm_2015_data %>%
  group_by(jday, location) %>%
  summarise(density = n()) %>%
  ungroup()
## `summarise()` has grouped output by 'jday'. You can override using the `.groups`
## argument.
glm_2015_data %<>%
  mutate(location = case_when(location == "dry cr" ~ "dry creek",
                              TRUE ~ as.character(location))) %>%
  mutate(location = as.factor(location))

d2015 <- glm_2015_data %>%
  group_by(jday, location) %>%
  summarise(density = n())
## `summarise()` has grouped output by 'jday'. You can override using the `.groups`
## argument.
glm_2015_data %<>%
  left_join(d2015)
## Joining, by = c("location", "jday")
f <- glm_2015_data  %>% 
  filter(geno_sex == "F") %>%
  group_by(location, jday) %>% 
  summarise(n_female_rg = n()) 
## `summarise()` has grouped output by 'location'. You can override using the
## `.groups` argument.
glm_2015_data  %<>%
  left_join(f)
## Joining, by = c("location", "jday")
m <- glm_2015_data  %>% 
  filter(geno_sex == "M") %>%
  group_by(location, jday) %>% 
  summarise(n_male_rg = n()) 
## `summarise()` has grouped output by 'location'. You can override using the
## `.groups` argument.
glm_2015_data  %<>%
  left_join(m) %>%
  mutate(sex_ratio_rg = n_male_rg/n_female_rg,
         sex_ratio_rg_l = log(sex_ratio_rg))
## Joining, by = c("location", "jday")
# there are 8 individuals from a release group with no males causing issues with NAs, let's eliminate these 8 individuals from the model

glm_2015_data  %<>%
  drop_na()

select(glm_2015_data, tlf, geno_sex, jday, location, density, sex_ratio_rg_l) %>%
  pairs(., lower.panel = panel.cor, diag.panel = panel.hist, upper.panel = panel.smooth2)

str(glm_2015_data)
## tibble [1,465 × 10] (S3: tbl_df/tbl/data.frame)
##  $ date          : Factor w/ 22 levels "2015-06-12","2015-06-16",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ geno_sex      : Factor w/ 2 levels "F","M": 1 2 1 1 1 1 1 1 1 1 ...
##  $ location      : Factor w/ 3 levels "breitenbush river",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ tlf           : num [1:1465] 0 2 0 0 0 0 0 0 0 0 ...
##  $ jday          : num [1:1465] 167 167 167 167 167 167 167 167 167 167 ...
##  $ density       : int [1:1465] 136 136 136 136 136 136 136 136 136 136 ...
##  $ n_female_rg   : int [1:1465] 70 70 70 70 70 70 70 70 70 70 ...
##  $ n_male_rg     : int [1:1465] 66 66 66 66 66 66 66 66 66 66 ...
##  $ sex_ratio_rg  : num [1:1465] 0.943 0.943 0.943 0.943 0.943 ...
##  $ sex_ratio_rg_l: num [1:1465] -0.0588 -0.0588 -0.0588 -0.0588 -0.0588 ...

Finally it looks like we could model location, there is limited collinearity between location and other predictors, but extremely unbalanced releases create a problem. There is a third release location with very few released individuals.

M2015_NB <- glm.nb(tlf ~ geno_sex+location  +density  +jday +sex_ratio_rg_l , data = glm_2015_data)
M2015_NB1 <- glm.nb(tlf ~ geno_sex+location  +density +sex_ratio_rg_l , data = glm_2015_data)
M2015_NB2 <- glm.nb(tlf ~ geno_sex+jday  +density +sex_ratio_rg_l , data = glm_2015_data)

#summary(M2015_NB)  
vif(M2015_NB)
##                    GVIF Df GVIF^(1/(2*Df))
## geno_sex       1.091251  1        1.044630
## location       4.547491  2        1.460303
## density        1.441949  1        1.200812
## jday           3.405624  1        1.845433
## sex_ratio_rg_l 1.512638  1        1.229893
vif(M2015_NB1)
##                    GVIF Df GVIF^(1/(2*Df))
## geno_sex       1.092600  1        1.045275
## location       1.334643  2        1.074834
## density        1.284297  1        1.133268
## sex_ratio_rg_l 1.290862  1        1.136161
vif(M2015_NB2)
##       geno_sex           jday        density sex_ratio_rg_l 
##       1.093813       1.000641       1.040138       1.133474

Reoving either jday or location fixes the multicollinearity. The extremely unbalanced nature of location suggests we should use jday, but recall that these variables will always be confounded.

Let’s look for non-linear density effects.

M2015_NB <- glm.nb(tlf ~ geno_sex+ jday + density + sex_ratio_rg_l +geno_sex*density + geno_sex*sex_ratio_rg_l + geno_sex*jday, data = glm_2015_data)

M2015_NB_nl_j <- glm.nb(tlf ~ geno_sex+ poly(jday,2) + density + sex_ratio_rg_l +geno_sex*density + geno_sex*sex_ratio_rg_l + geno_sex*poly(jday,2), data = glm_2015_data)

M2015_NB_nl_d <- glm.nb(tlf ~ geno_sex+ jday + poly(density,2) + sex_ratio_rg_l +geno_sex*poly(density,2) + geno_sex*sex_ratio_rg_l + geno_sex*jday, data = glm_2015_data)

M2015_NB_nl_s <- glm.nb(tlf ~ geno_sex+ jday + density + poly(sex_ratio_rg_l,2) +geno_sex*density + geno_sex*poly(sex_ratio_rg_l,2) + geno_sex*jday, data = glm_2015_data)

M2015_NB_nl_dj <- glm.nb(tlf ~ geno_sex+ poly(jday,2) + poly(density,2) + sex_ratio_rg_l +geno_sex*poly(density,2) + geno_sex*sex_ratio_rg_l + geno_sex*poly(jday,2), data = glm_2015_data)

M2015_NB_nl_sj <- glm.nb(tlf ~ geno_sex+ poly(jday,2) + density + poly(sex_ratio_rg_l,2) +geno_sex*density + geno_sex*poly(sex_ratio_rg_l,2) + geno_sex*poly(jday,2), data = glm_2015_data)

M2015_NB_nl_sd <- glm.nb(tlf ~ geno_sex+ jday + poly(density,2) + poly(sex_ratio_rg_l,2) +geno_sex*poly(density,2) + geno_sex*poly(sex_ratio_rg_l,2) + geno_sex*jday, data = glm_2015_data)

M2015_NB_nl_sdj <- glm.nb(tlf ~ geno_sex+poly(jday,2)  +poly(density,2) + poly(sex_ratio_rg_l,2) +geno_sex*poly(density,2)   + geno_sex*poly(sex_ratio_rg_l,2) + geno_sex*poly(jday,2), data = glm_2015_data)

AIC(M2015_NB,M2015_NB_nl_j, M2015_NB_nl_d, M2015_NB_nl_s, M2015_NB_nl_dj, M2015_NB_nl_sj, M2015_NB_nl_sd, M2015_NB_nl_sdj )
BIC(M2015_NB,M2015_NB_nl_j, M2015_NB_nl_d, M2015_NB_nl_s, M2015_NB_nl_dj, M2015_NB_nl_sj, M2015_NB_nl_sd, M2015_NB_nl_sdj )
anova(M2015_NB, M2015_NB_nl_s)

Non-linear for sex ratio is better than all other models by AIC but not BIC and signifcantly better than a purely linear effects model according to LRT (p = 0.009) Let’s use it.

drop1(M2015_NB_nl_s, test ="Chisq")

Interaction not significant, so let’s fit again with main effects

M2015_NB8 <- glm.nb(tlf ~ geno_sex+ jday + density + poly(sex_ratio_rg_l,2) +geno_sex*density + geno_sex*poly(sex_ratio_rg_l,2) , data = glm_2015_data)
drop1(M2015_NB8, test = "Chisq")

LRT suggests a final model with day, sex, density sex ratio and two interactions, sex ratio * sex and density * sex

Let’s do model selection with Wald

summary(M2015_NB_nl_s)
## 
## Call:
## glm.nb(formula = tlf ~ geno_sex + jday + density + poly(sex_ratio_rg_l, 
##     2) + geno_sex * density + geno_sex * poly(sex_ratio_rg_l, 
##     2) + geno_sex * jday, data = glm_2015_data, init.theta = 0.3080581008, 
##     link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.1091  -0.9165  -0.8036   0.0693   3.1796  
## 
## Coefficients:
##                                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                         -2.083800   0.567508  -3.672 0.000241 ***
## geno_sexM                            1.098066   0.789878   1.390 0.164477    
## jday                                 0.010031   0.002460   4.078 4.55e-05 ***
## density                             -0.001624   0.001497  -1.084 0.278224    
## poly(sex_ratio_rg_l, 2)1             5.568957   3.080979   1.808 0.070680 .  
## poly(sex_ratio_rg_l, 2)2            -1.130184   3.263261  -0.346 0.729090    
## geno_sexM:density                   -0.003550   0.001966  -1.805 0.070998 .  
## geno_sexM:poly(sex_ratio_rg_l, 2)1  18.573790   9.091265   2.043 0.041049 *  
## geno_sexM:poly(sex_ratio_rg_l, 2)2 -18.799112   7.442570  -2.526 0.011541 *  
## geno_sexM:jday                      -0.005203   0.003489  -1.491 0.135886    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(0.3081) family taken to be 1)
## 
##     Null deviance: 1103.9  on 1464  degrees of freedom
## Residual deviance: 1057.0  on 1455  degrees of freedom
## AIC: 3450
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.3081 
##           Std. Err.:  0.0237 
## 
##  2 x log-likelihood:  -3428.0140
summary(M2015_NB8)
## 
## Call:
## glm.nb(formula = tlf ~ geno_sex + jday + density + poly(sex_ratio_rg_l, 
##     2) + geno_sex * density + geno_sex * poly(sex_ratio_rg_l, 
##     2), data = glm_2015_data, init.theta = 0.3064454873, link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.0913  -0.9168  -0.8106   0.0971   3.1988  
## 
## Coefficients:
##                                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                         -1.543702   0.421908  -3.659 0.000253 ***
## geno_sexM                           -0.008039   0.235566  -0.034 0.972775    
## jday                                 0.007529   0.001747   4.310 1.63e-05 ***
## density                             -0.001522   0.001501  -1.014 0.310488    
## poly(sex_ratio_rg_l, 2)1             5.275986   3.086383   1.709 0.087369 .  
## poly(sex_ratio_rg_l, 2)2            -0.678458   3.233867  -0.210 0.833825    
## geno_sexM:density                   -0.003902   0.001964  -1.986 0.046982 *  
## geno_sexM:poly(sex_ratio_rg_l, 2)1  19.602868   9.034762   2.170 0.030028 *  
## geno_sexM:poly(sex_ratio_rg_l, 2)2 -21.362181   7.272647  -2.937 0.003310 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(0.3064) family taken to be 1)
## 
##     Null deviance: 1100.7  on 1464  degrees of freedom
## Residual deviance: 1056.2  on 1456  degrees of freedom
## AIC: 3450.2
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.3064 
##           Std. Err.:  0.0235 
## 
##  2 x log-likelihood:  -3430.2220

Stepwise and single term deletions model selections agree

plot(M2015_NB8)

qqrplot(M2015_NB8)

rootogram(M2015_NB8)

simulateResiduals(M2015_NB8, plot = TRUE)

## Object of Class DHARMa with simulated residuals based on 250 simulations with refit = FALSE . See ?DHARMa::simulateResiduals for help. 
##  
## Scaled residual values: 0.7536809 0.9206472 0.2592578 0.1784607 0.3411401 0.04792029 0.3777576 0.5278848 0.214797 0.03338875 0.2580567 0.8106493 0.8962189 0.3335934 0.653365 0.7628274 0.02049518 0.6664229 0.630657 0.5584901 ...

Summary

This one has a lot of significant effects. Two interaction terms: sex and density, as well as sex and sex ratio, jday is also in there.

pe_j_2015 <- as.data.frame(predictorEffect("jday", M2015_NB8))

actual_means <- glm_2015_data %>%
  group_by(jday) %>%
  summarise(mean_tlf = mean(tlf))

ggplot(data = pe_j_2015, aes(x = (jday), y = fit))+ 
  geom_line() +
  xlab("Julian Day of Release") +
  geom_smooth( aes(ymin = lower, ymax = upper), stat = "identity") +
  theme_bw()+ylab("TLF")+coord_cartesian(ylim = c(0, 3)) +
  geom_rug(data = glm_2015_data, aes(x = jday, y = NULL)) + 
  geom_point(data = actual_means, aes(x = jday, y = mean_tlf), alpha = 0.8, shape = 24, size = 2)+ggtitle("2015 Predictors of Fitness - Release Day")+ylim(0,3)

eff1 <- predictorEffect("density", M2015_NB8)
effdf <- as.data.frame(eff1)


actual_means <- glm_2015_data %>%
  group_by(density, geno_sex) %>%
  summarise(mean_tlf = mean(tlf))
## `summarise()` has grouped output by 'density'. You can override using the
## `.groups` argument.
ggplot(data = effdf, aes(x = (density), y = fit, color = geno_sex))+ 
  geom_line() +
  xlab("Density") +
  geom_smooth( aes(ymin = lower, ymax = upper, fill = geno_sex, colour = geno_sex), stat = "identity") +
  theme_bw()+ylab("TLF")+coord_cartesian(ylim = c(0, 3)) + scale_color_manual(labels = c("Female", "Male"), name = "Sex", values = c("#228833", "#AA3377")) + scale_fill_manual(labels = c("Female", "Male"), name = "Sex", values = c("#228833", "#AA3377")) +
  geom_rug(data = glm_2015_data, aes(x = density, y = NULL, color = NULL))+ggtitle("2015 Predictors of Fitness - Sex * Density Interaction")

# sex ratio
eff1 <- predictorEffect("sex_ratio_rg_l", M2015_NB8)
effdf <- as.data.frame(eff1)

effdf %<>%
  mutate(sex_ratio = exp(sex_ratio_rg_l))


ggplot(data = effdf, aes(x = (sex_ratio), y = fit, color = geno_sex))+ 
  geom_line()+scale_x_continuous(trans = "log", breaks = c(1/10, 1/5, 1/2, 1, 2, 4, 6)) +
  xlab( bquote(atop("Release Group Sex Ratio",(N[male]/N[female])))) +
  geom_smooth( aes(ymin = lower, ymax = upper, fill = geno_sex, colour = geno_sex), stat = "identity") +
  theme_bw()+ylab("TLF")+coord_cartesian(ylim = c(0, 3)) +  scale_color_manual(labels = c("Female", "Male"), name = "Sex", values = c("#228833", "#AA3377")) + scale_fill_manual(labels = c("Female", "Male"), name = "Sex", values = c("#228833", "#AA3377")) +
  geom_rug(data = glm_2015_data, aes(x = sex_ratio_rg, y = NULL, color = NULL)) + ggtitle("2015 Predictors of Fitness\nSex * Sex Ratio Interaction")

  #geom_point(data = actual_means, aes(x = sex_ratio_rg, y = mean_tlf, color = geno_sex), alpha = 0.8, shape = 24, size = 2)

glm stray thoughts

  • density effects are a property of individual outplanting events. one interpration of this is that this suggests the fish from different outplanting evens do not move through the basin and mix much. they tend to spawn with each other. but what about if it has to do with survival until spawning? if so why is there an interaction with sex.

  • sex ratios, the density results suggest that we might be interested in sex ratios. shold we examine variation in sex ratios at the level of individual outplanting events, or for the entire year at a time. if the former, then we can do so without mixed models.

Mixed Model

Here we explore alternative model specifications for predictors of lifetime fitness.

Compared to the approach taken previously I think we can fit a model that provides more power to predict signficant effects on fitness. I think date should be modeled as a continuous fixed variable (e.g. Julian day), we should add a variable - release group – that should be modeled as a random effect, and all years should be combined into the same model (year included as a second random effect).

I made a note of the rationale here below, in case we want to discuss in the future.

Fitting date as a fixed categorical effect effectively asks if each individual release date is different from others. i.e. did something happen ON THAT DAY that changed the fitness of released fish compared to other days. This may be important to know, but what can this tell us that will allow an improvement to management practices? Simply that some days are better than other. This is an interesting result, but I think we can say more and there is a more interesting way to ask the question. Including the effect of release date as a fixed, continuous variable is more appropriate if we are interested in evaluating the putative effect on fitness of environmental variables that vary over time, but then we are ignoring the batchiness of releases.

There are also day-to-day release batch effects, of course, but these are better fit as random effects because we are more interested in their contribution to variance than their fixed effect per se (e.g. we assume release batch effects are part of population of unsampled batch effects that contribute to variance). A good way to think about this is that if we added infinite years of data, there would be infinitely more levels of these effects, suggesting this is best modeled as a random effect, but eventually we’d saturate the julian day of release variable. As the models are currently specified we do not evaluate the contribution to variance of the batchiness of release day. We just model a linear relationship between julian day and fitness.

A mixed model also allows us to combine years and gain power. By fitting just one model across all years, we increase power to estimate the effects of variables because levels the variables take are repeated across multiple years. (e.g. if releases into the resevoir are somehow reducing fitness, why are we trying to figure this out looking one year at a time, when there are releases there every year). Here we are not interested in year over year trends in fitness or the contributions of specific years to fitness as much as we are interested in the variation due to year and evaluating the effects of release date, sex and release location after accounting for annual variation. This suggests we should fit year as a random effect. Fitting year as a fixed effect asks a different question and perhaps we should evaluate this as well.

Year level fixed effects are also likely to have a large influence on fitness. However, there are many unmeasured variables that vary across years. From the available data, the annual level variables with the most biologically interpretable hypothetical effects are number of outplants and sex ratios.

Finally, I don’t think there’s any good reason to assume a linear relationship between Julian day or the number of fish released (density) and fitness. We should expect stabilizing selection, suggesting that we should evaluate a quadratic effect of Julian day and/or density (after scaling and centering) on fitness.

Mixed Modeling Overview

We primarily follow the approach described in Zuur et al 2009. Mixed Effects Models and Extensions in Ecology with R.

  1. Exploratory Data Analysis: First we explore the relationships between all variables in the dataset.
  2. Multicollinearity: In addition to our findings from EDA above, we fit main effects models calculate variance inflation factors to identify multicollinearity among main effects.
  3. Random Effects Model Selection: After removing effects contributing to multicollinearity in a main effects only model, we fit an otherwise saturated models including all main effects and interactions, but varied which random effects were included. Model fit was by REML. We chose the best random effects structure by AIC and LRTs.
  4. Fixed Effects Model Selection: We conducted model selection on the fixed effects using LRTs and backward stepwise selection on Wald tests for signficant effects (p < 0.05 not multiple comparison corrected). Model fit was by ML. For the LRTs we stepwise dropped non-significant interaction terms from the model, to evaluate if main effects were significant.
  5. Model Validation: We validated the model using simulated residuals from the DHARMa package.
  6. Estimated Marginal Means and Effect: To improve biological intepretation of signficant predictors of fitness, we estimated marginal means for significant effects.

Data and EDA

Let’s prepare the data. We need several additional variables (release group, julian day, density for each release group, total number of fish released in a year, annual sex ratios).

mm_data <-  parents %>%
  filter(type %in% c("outplant", "reintro_above"), year <2016) %>%
  select(date, geno_sex, location, tlf, year) %>%
  drop_na() %>%
  mutate(jday = as.numeric(format(date, "%j"))) %>% #julian day in this case: days since the first day of the year
  mutate(location = case_when(location == "dry cr" ~ "dry creek",
                              TRUE ~ as.character(location))) %>%
  mutate(location = as.factor(location)) %>%
  mutate(jday_c = scale(jday, scale = F),#center the julian day to help with convergence
         geno_sex = as.factor(geno_sex),
         location = as.factor(location),
         year = as.factor(year),
         group = as.factor(paste(date, location)))


# lets add density
dens <- mm_data %>%
 group_by(jday, location, year) %>%
  summarise(density = n())
## `summarise()` has grouped output by 'jday', 'location'. You can override using
## the `.groups` argument.
mm_data %<>%
  left_join(dens)
## Joining, by = c("location", "year", "jday")
# lets add overall size of release in a year
dens <- mm_data %>%
 group_by( year) %>%
  summarise(annual_n = n())

mm_data %<>%
  left_join(dens)
## Joining, by = "year"
# lets add sex ratio
#build release group sex ratio variable
f <- mm_data %>% 
  filter(geno_sex == "F") %>%
  group_by(location, jday, year) %>% 
  summarise(n_female_rg = n()) 
## `summarise()` has grouped output by 'location', 'jday'. You can override using
## the `.groups` argument.
mm_data %<>%
  left_join(f)
## Joining, by = c("location", "year", "jday")
m <- mm_data %>% 
  filter(geno_sex == "M") %>%
  group_by(location, jday, year) %>% 
  summarise(n_male_rg = n()) 
## `summarise()` has grouped output by 'location', 'jday'. You can override using
## the `.groups` argument.
mm_data %<>%
  left_join(m) %>%
  mutate(sex_ratio_rg = n_male_rg/n_female_rg)
## Joining, by = c("location", "year", "jday")
# maybe the release groups all mix thoroughly spawn together and we should fit sex ratio at the level of year
f <- mm_data %>% 
  filter(geno_sex == "F") %>%
  group_by( year) %>% 
  summarise(n_female_y = n()) 

mm_data %<>%
  left_join(f)
## Joining, by = "year"
m <- mm_data %>% 
  filter(geno_sex == "M") %>%
  group_by( year) %>% 
  summarise(n_male_y = n()) 

mm_data %<>%
  left_join(m) %>%
  mutate(sex_ratio_y = n_male_y/n_female_y)
## Joining, by = "year"
mm_data %<>%
  mutate(sex_ratio_rg_l = log(sex_ratio_rg),
         sex_ratio_y_l = log(sex_ratio_y))

mm_data %<>%
  drop_na()

Let’s explore the data a bit too and look for issues with collinearity

select(mm_data, tlf, geno_sex, jday, density,sex_ratio_y_l, sex_ratio_rg_l, annual_n , location) %>%
  pairs(., lower.panel = panel.cor, diag.panel = panel.hist, upper.panel = panel.smooth2)

beyond_opt_main <- glm.nb(tlf ~ jday_c + geno_sex + location  + density+sex_ratio_y_l + sex_ratio_rg_l + annual_n, data = mm_data)
vif(beyond_opt_main)
##                    GVIF Df GVIF^(1/(2*Df))
## jday_c         2.212875  1        1.487574
## geno_sex       1.077381  1        1.037970
## location       7.292641  7        1.152483
## density        1.581151  1        1.257438
## sex_ratio_y_l  2.171821  1        1.473710
## sex_ratio_rg_l 1.379888  1        1.174686
## annual_n       2.820263  1        1.679364

Location demonstrates strong multicollinearity . This isn’t surprising given our analyses within years. Let’s remove and look ant VIFs again.

beyond_opt_main <- glm.nb(tlf ~ jday_c + geno_sex + density+sex_ratio_y_l + sex_ratio_rg_l + annual_n, data = mm_data)
vif(beyond_opt_main)
##         jday_c       geno_sex        density  sex_ratio_y_l sex_ratio_rg_l 
##       1.052163       1.079292       1.325110       1.368598       1.255625 
##       annual_n 
##       1.437828

As a summary so far: location demonstrated collinearity with release date in some years. Any significant effect of release date needs to be considered in light of this confounding. We also considered two year-level fixed effects, total number of outplants in a year and the overall sex ratio among outplants in a year. These were only slightly collinear, but we should remember that any year level effects are confounded by all the other variables, considered here or otherwise, that vary between years and could be driving signifcant effects.

Model Selection

Now lets fit the beyond optimal model. This has more fixed effects than we could possibly want in our final model, but that is desirable for finding the optimum random effect structure. We should also fit all fixed effects that could reasonably be non-linear using polynomials, as the linear model is nested within the polynomial and therefore the polynomial is better as the satured/beyond optimal.

In the annual GLMs, we saw that extreme values of sex ratio or density produced lower fitness. So we should definitely try to fit at least these two variables as non-linear. Day never had anything resembling a non-linear effect. For annual sex ratio’s we only have observations across 5 years and no extreme values, so we should probably fit as linear, but it is worth exploring. We’ll include these three as non-linear effects to start.

mm_beyond_opt <- glmmTMB(tlf ~ jday_c + geno_sex  +  poly(density,2)+ annual_n + geno_sex*poly(density,2)+poly(sex_ratio_y_l,2)+ poly(sex_ratio_rg_l,2) + geno_sex*poly(sex_ratio_rg_l,2) + geno_sex*poly(sex_ratio_y_l,2) + (1|group) + (1|year), data = mm_data, family = nbinom2, REML = TRUE)

mm_beyond_opt2 <- glmmTMB(tlf ~ jday_c + geno_sex  +  poly(density,2)+ annual_n + geno_sex*poly(density,2)+poly(sex_ratio_y_l,2)+ poly(sex_ratio_rg_l,2) + geno_sex*poly(sex_ratio_rg_l,2) + geno_sex*poly(sex_ratio_y_l,2)  +(1|year), data = mm_data, family = nbinom2,  REML = TRUE)

mm_beyond_opt3 <- glmmTMB(tlf ~ jday_c + geno_sex  +  poly(density,2)+ annual_n + geno_sex*poly(density,2)+poly(sex_ratio_y_l,2)+ poly(sex_ratio_rg_l,2) + geno_sex*poly(sex_ratio_rg_l,2) + geno_sex*poly(sex_ratio_y_l,2) + (1|group), data = mm_data, family = nbinom2,  REML = TRUE)

mm_beyond_opt4 <- glmmTMB(tlf ~ jday_c + geno_sex  +  poly(density,2)+ annual_n + geno_sex*poly(density,2)+poly(sex_ratio_y_l,2)+ poly(sex_ratio_rg_l,2) + geno_sex*poly(sex_ratio_rg_l,2) + geno_sex*poly(sex_ratio_y_l,2) , data = mm_data, family = nbinom2,  REML = TRUE)



AIC(mm_beyond_opt, mm_beyond_opt2, mm_beyond_opt3, mm_beyond_opt4)
BIC(mm_beyond_opt, mm_beyond_opt2, mm_beyond_opt3, mm_beyond_opt4)

lrtest(mm_beyond_opt4, mm_beyond_opt)
lrtest(mm_beyond_opt4, mm_beyond_opt2)
lrtest(mm_beyond_opt4, mm_beyond_opt3)
summary(mm_beyond_opt)

AIC and BIC agree the best fit to the data is the full random effects model including both release group and year.

Ultimately the correct random effects structure is a question about the inferences we’d like to make and AIC is only one piece of information to help us understand the releaitonship between parsimony and model fit.

For example, so many of our variables are calculated at the level of release group, and including release group as a random effect means these variables will be competing with random intercepts for release group. Same goes for year.

In the case of year we also have a problem that we are trying to estimate a variance form only 5 levels of the variable of interest. The generally held threshold for the levels required is at least five, so we’re marginally okay here. However, the generally accepted cutoff for estimating the standard error of these variance estimates is 8, so we should restrict ourselves from discussing this result if we choose to use year as a random effect.

Since we are interested in evaluating the significance of predictors primarily, I’d prefer to be conservative and include them as random effectes. Failing to include year and release group as random effects implies that we are uninterested in the correlation within groups. Inferentially, this implies that the fixed effects at the level of release group or year (release group: sex ratio, density and year: n and sex ratio) are the only effects that might lead to correlation within the levels, and we freely pool across all levels of release group or year and attribute all variance to fixed effects.

We will include random effects for year and release group.

Let’s refit the model using ML.

mm_beyond_opt <- glmmTMB(tlf ~ jday_c + geno_sex  +  poly(density,2)+ annual_n + geno_sex*poly(density,2)+poly(sex_ratio_y_l,2)+ poly(sex_ratio_rg_l,2) + geno_sex*poly(sex_ratio_rg_l,2) + geno_sex*poly(sex_ratio_y_l,2) + (1|group) + (1|year), data = mm_data, family = nbinom2)

summary(mm_beyond_opt)
##  Family: nbinom2  ( log )
## Formula:          
## tlf ~ jday_c + geno_sex + poly(density, 2) + annual_n + geno_sex *  
##     poly(density, 2) + poly(sex_ratio_y_l, 2) + poly(sex_ratio_rg_l,  
##     2) + geno_sex * poly(sex_ratio_rg_l, 2) + geno_sex * poly(sex_ratio_y_l,  
##     2) + (1 | group) + (1 | year)
## Data: mm_data
## 
##      AIC      BIC   logLik deviance df.resid 
##   7187.8   7306.6  -3574.9   7149.8     3828 
## 
## Random effects:
## 
## Conditional model:
##  Groups Name        Variance Std.Dev.
##  group  (Intercept) 0.05878  0.2425  
##  year   (Intercept) 0.04581  0.2140  
## Number of obs: 3847, groups:  group, 96; year, 5
## 
## Dispersion parameter for nbinom2 family (): 0.306 
## 
## Conditional model:
##                                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                        -7.313e-01  3.637e-01  -2.011 0.044349 *  
## jday_c                              3.288e-03  1.734e-03   1.895 0.058033 .  
## geno_sexM                          -3.175e-01  8.550e-02  -3.714 0.000204 ***
## poly(density, 2)1                  -5.043e+00  5.255e+00  -0.960 0.337221    
## poly(density, 2)2                   1.792e+00  4.534e+00   0.395 0.692721    
## annual_n                            1.218e-04  3.401e-04   0.358 0.720185    
## poly(sex_ratio_y_l, 2)1            -3.238e+01  7.345e+00  -4.409 1.04e-05 ***
## poly(sex_ratio_y_l, 2)2            -8.885e+00  8.373e+00  -1.061 0.288635    
## poly(sex_ratio_rg_l, 2)1           -7.018e-01  5.252e+00  -0.134 0.893697    
## poly(sex_ratio_rg_l, 2)2           -5.281e+00  4.716e+00  -1.120 0.262746    
## geno_sexM:poly(density, 2)1        -2.994e+00  5.287e+00  -0.566 0.571191    
## geno_sexM:poly(density, 2)2        -7.872e+00  5.141e+00  -1.531 0.125679    
## geno_sexM:poly(sex_ratio_rg_l, 2)1  1.225e+01  7.520e+00   1.629 0.103234    
## geno_sexM:poly(sex_ratio_rg_l, 2)2 -6.148e+00  7.635e+00  -0.805 0.420719    
## geno_sexM:poly(sex_ratio_y_l, 2)1  -1.234e+01  6.045e+00  -2.041 0.041276 *  
## geno_sexM:poly(sex_ratio_y_l, 2)2   1.821e+00  5.413e+00   0.336 0.736591    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Now that we have the random structure figured out, let’s do model selection on the fixed effects.

First should we fit density and/or jday as non-linear

beyond_opt_lin <- glmmTMB(tlf ~ jday_c + geno_sex  +  density + annual_n + sex_ratio_y_l + sex_ratio_rg_l + geno_sex*density + geno_sex*sex_ratio_rg_l + geno_sex*sex_ratio_y_l + (1|group) + (1|year), data = mm_data, family = nbinom2)

beyond_opt_nl_s <- glmmTMB(tlf ~ jday_c + geno_sex  +  density + annual_n + sex_ratio_y_l + poly(sex_ratio_rg_l,2) + geno_sex*density + geno_sex*poly(sex_ratio_rg_l,2) + geno_sex*sex_ratio_y_l + (1|group) + (1|year), data = mm_data, family = nbinom2)

beyond_opt_nl_d <-glmmTMB(tlf ~ jday_c + geno_sex  +  poly(density,2) + annual_n + sex_ratio_y_l + sex_ratio_rg_l + geno_sex*poly(density,2) + geno_sex*sex_ratio_rg_l + geno_sex*sex_ratio_y_l + (1|group) + (1|year), data = mm_data, family = nbinom2)

beyond_opt_nl_y <- glmmTMB(tlf ~ jday_c + geno_sex  +  density + annual_n + poly(sex_ratio_y_l,2) + sex_ratio_rg_l + geno_sex*density + geno_sex*sex_ratio_rg_l + geno_sex*poly(sex_ratio_y_l,2) + (1|group) + (1|year), data = mm_data, family = nbinom2)
  
beyond_opt_nl_yd <- glmmTMB(tlf ~ jday_c + geno_sex  +  poly(density,2) + annual_n + poly(sex_ratio_y_l,2) + sex_ratio_rg_l + geno_sex*poly(density,2) + geno_sex*sex_ratio_rg_l + geno_sex*poly(sex_ratio_y_l,2) + (1|group) + (1|year), data = mm_data, family = nbinom2)

beyond_opt_nl_sy <- glmmTMB(tlf ~ jday_c + geno_sex  +  density + annual_n + poly(sex_ratio_y_l,2) + poly(sex_ratio_rg_l,2) + geno_sex*density + geno_sex*poly(sex_ratio_rg_l,2) + geno_sex*poly(sex_ratio_y_l,2) + (1|group) + (1|year), data = mm_data, family = nbinom2)

beyond_opt_nl_ds <-glmmTMB(tlf ~ jday_c + geno_sex  +  poly(density,2) + annual_n + sex_ratio_y_l + poly(sex_ratio_rg_l,2) + geno_sex*poly(density,2) + geno_sex*poly(sex_ratio_rg_l,2) + geno_sex*sex_ratio_y_l + (1|group) + (1|year), data = mm_data, family = nbinom2)

beyond_opt_nl_dys <-glmmTMB(tlf ~ jday_c + geno_sex  +  poly(density,2) + annual_n + poly(sex_ratio_y_l,2) + poly(sex_ratio_rg_l,2) + geno_sex*poly(density,2) + geno_sex*poly(sex_ratio_rg_l,2) + geno_sex*poly(sex_ratio_y_l,2) + (1|group) + (1|year), data = mm_data, family = nbinom2)


  
AIC(beyond_opt_nl_s, beyond_opt_nl_d, beyond_opt_nl_y,beyond_opt_nl_yd ,beyond_opt_nl_sy , beyond_opt_nl_ds, beyond_opt_nl_dys, beyond_opt_lin)
BIC(beyond_opt_nl_s, beyond_opt_nl_d, beyond_opt_nl_y,beyond_opt_nl_yd ,beyond_opt_nl_sy , beyond_opt_nl_ds, beyond_opt_nl_dys, beyond_opt_lin)
anova(beyond_opt_nl_d, beyond_opt_lin)
anova(beyond_opt_nl_ds, beyond_opt_lin)

Fitting release group density as a polynomial appears best by AIC, but improvement is small. LRT suggests not significant. Fitting both release group density and sex ratio is a VERY slight improvement to the data over a purely linear model. Annual non-linear sex ratio effects are not worht the cost in df.

This fits wiht our a priori expectations from EDA. Both release group varaibles should be non-linear relationship over the range of densities and sex ratios used at the release group level. Whereas with annual sex ratio, only extreme male-biased sex ratios are used, so we can get a more parsimonius model with a linear rather than quadratic effect. Although the delta AIC is not big, let’s fit the more complex model. We can always work towards more parsimony with further model selection. We’ll keep non-linear effect of release group density and sex ratio.

drop1(beyond_opt_nl_ds, test = "Chisq")

Let’s get rid of the worst interaction term and fit again

drop1(glmmTMB(tlf ~ jday_c + geno_sex  + density + annual_n + sex_ratio_y_l + poly(sex_ratio_rg_l,2) + geno_sex*density  + geno_sex*poly(sex_ratio_rg_l,2) + geno_sex*sex_ratio_y_l + (1|group) + (1|year), data = mm_data, family = nbinom2), test = "Chisq")

Still some non-significant interactions.

drop1(glmmTMB(tlf ~ jday_c + geno_sex  +  density + annual_n + sex_ratio_y_l + sex_ratio_rg_l + geno_sex*density  + geno_sex*sex_ratio_rg_l + geno_sex*sex_ratio_y_l + (1|group) + (1|year), data = mm_data, family = nbinom2), test = "Chisq")
drop1(glmmTMB(tlf ~ jday_c + geno_sex  + density + annual_n + sex_ratio_y_l + sex_ratio_rg_l + geno_sex*density   + geno_sex*sex_ratio_y_l + (1|group) + (1|year), data = mm_data, family = nbinom2), test = "Chisq")
drop1(glmmTMB(tlf ~ jday_c + geno_sex  + density + annual_n + sex_ratio_y_l + sex_ratio_rg_l    + geno_sex*sex_ratio_y_l + (1|group) + (1|year), data = mm_data, family = nbinom2), test = "Chisq")

LRT suggests only sex, annual sex ratio and their interaction are signficant. Now Let’s do backward stepwise using Wald Tests.

summary(beyond_opt_nl_ds)
##  Family: nbinom2  ( log )
## Formula:          
## tlf ~ jday_c + geno_sex + poly(density, 2) + annual_n + sex_ratio_y_l +  
##     poly(sex_ratio_rg_l, 2) + geno_sex * poly(density, 2) + geno_sex *  
##     poly(sex_ratio_rg_l, 2) + geno_sex * sex_ratio_y_l + (1 |  
##     group) + (1 | year)
## Data: mm_data
## 
##      AIC      BIC   logLik deviance df.resid 
##   7184.9   7291.3  -3575.5   7150.9     3830 
## 
## Random effects:
## 
## Conditional model:
##  Groups Name        Variance Std.Dev.
##  group  (Intercept) 0.05895  0.2428  
##  year   (Intercept) 0.04902  0.2214  
## Number of obs: 3847, groups:  group, 96; year, 5
## 
## Dispersion parameter for nbinom2 family (): 0.306 
## 
## Conditional model:
##                                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                        -0.3365024  0.2595560  -1.296   0.1948    
## jday_c                              0.0034271  0.0017302   1.981   0.0476 *  
## geno_sexM                          -0.0598978  0.1271991  -0.471   0.6377    
## poly(density, 2)1                  -4.2732733  5.0793464  -0.841   0.4002    
## poly(density, 2)2                   0.9147192  4.4239326   0.207   0.8362    
## annual_n                            0.0003213  0.0002788   1.153   0.2491    
## sex_ratio_y_l                      -2.0752305  0.4912886  -4.224  2.4e-05 ***
## poly(sex_ratio_rg_l, 2)1           -0.5432576  5.2372229  -0.104   0.9174    
## poly(sex_ratio_rg_l, 2)2           -5.0882662  4.6768636  -1.088   0.2766    
## geno_sexM:poly(density, 2)1        -3.8731545  4.8313588  -0.802   0.4227    
## geno_sexM:poly(density, 2)2        -7.3007049  4.9766183  -1.467   0.1424    
## geno_sexM:poly(sex_ratio_rg_l, 2)1 12.2454422  7.4720543   1.639   0.1012    
## geno_sexM:poly(sex_ratio_rg_l, 2)2 -6.5480529  7.6263118  -0.859   0.3906    
## geno_sexM:sex_ratio_y_l            -0.8895110  0.3860034  -2.304   0.0212 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
beyond_opt_nl_d_1 <- glmmTMB(tlf ~ jday_c + geno_sex  +  poly(density,2) + annual_n + sex_ratio_y_l + sex_ratio_rg_l + geno_sex*poly(density,2)  + geno_sex*sex_ratio_rg_l + geno_sex*sex_ratio_y_l + (1|group) + (1|year), data = mm_data, family = nbinom2)

summary(beyond_opt_nl_d_1)
##  Family: nbinom2  ( log )
## Formula:          
## tlf ~ jday_c + geno_sex + poly(density, 2) + annual_n + sex_ratio_y_l +  
##     sex_ratio_rg_l + geno_sex * poly(density, 2) + geno_sex *  
##     sex_ratio_rg_l + geno_sex * sex_ratio_y_l + (1 | group) +      (1 | year)
## Data: mm_data
## 
##      AIC      BIC   logLik deviance df.resid 
##   7185.6   7279.5  -3577.8   7155.6     3832 
## 
## Random effects:
## 
## Conditional model:
##  Groups Name        Variance Std.Dev.
##  group  (Intercept) 0.07351  0.2711  
##  year   (Intercept) 0.04480  0.2117  
## Number of obs: 3847, groups:  group, 96; year, 5
## 
## Dispersion parameter for nbinom2 family (): 0.305 
## 
## Conditional model:
##                               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                 -0.3352201  0.2523505  -1.328   0.1840    
## jday_c                       0.0026533  0.0017794   1.491   0.1359    
## geno_sexM                   -0.0975262  0.1160346  -0.840   0.4006    
## poly(density, 2)1           -3.0883507  5.2649533  -0.587   0.5575    
## poly(density, 2)2           -0.2486335  4.4899457  -0.055   0.9558    
## annual_n                     0.0003280  0.0002748   1.194   0.2326    
## sex_ratio_y_l               -2.1256370  0.4787927  -4.440 9.01e-06 ***
## sex_ratio_rg_l               0.0447376  0.1062082   0.421   0.6736    
## geno_sexM:poly(density, 2)1 -2.7635940  4.7660074  -0.580   0.5620    
## geno_sexM:poly(density, 2)2 -7.1270042  4.9143030  -1.450   0.1470    
## geno_sexM:sex_ratio_rg_l     0.1141917  0.1615146   0.707   0.4796    
## geno_sexM:sex_ratio_y_l     -0.8376182  0.3848907  -2.176   0.0295 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
beyond_opt_nl_d_2 <- glmmTMB(tlf ~ jday_c + geno_sex  +  annual_n+ poly(density,2) + sex_ratio_y_l + sex_ratio_rg_l + geno_sex*poly(density,2)  + geno_sex*sex_ratio_y_l + (1|group) + (1|year), data = mm_data, family = nbinom2)

summary(beyond_opt_nl_d_2)
##  Family: nbinom2  ( log )
## Formula:          
## tlf ~ jday_c + geno_sex + annual_n + poly(density, 2) + sex_ratio_y_l +  
##     sex_ratio_rg_l + geno_sex * poly(density, 2) + geno_sex *  
##     sex_ratio_y_l + (1 | group) + (1 | year)
## Data: mm_data
## 
##      AIC      BIC   logLik deviance df.resid 
##   7184.1   7271.7  -3578.1   7156.1     3833 
## 
## Random effects:
## 
## Conditional model:
##  Groups Name        Variance Std.Dev.
##  group  (Intercept) 0.06962  0.2639  
##  year   (Intercept) 0.04701  0.2168  
## Number of obs: 3847, groups:  group, 96; year, 5
## 
## Dispersion parameter for nbinom2 family (): 0.305 
## 
## Conditional model:
##                              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                 -0.329367   0.253742  -1.298   0.1943    
## jday_c                       0.002877   0.001731   1.661   0.0966 .  
## geno_sexM                   -0.092025   0.115955  -0.794   0.4274    
## annual_n                     0.000329   0.000278   1.184   0.2365    
## poly(density, 2)1           -3.639825   5.147220  -0.707   0.4795    
## poly(density, 2)2           -0.434227   4.441332  -0.098   0.9221    
## sex_ratio_y_l               -2.168215   0.481021  -4.508 6.56e-06 ***
## sex_ratio_rg_l               0.088754   0.085013   1.044   0.2965    
## geno_sexM:poly(density, 2)1 -2.695795   4.769861  -0.565   0.5720    
## geno_sexM:poly(density, 2)2 -6.148975   4.715421  -1.304   0.1922    
## geno_sexM:sex_ratio_y_l     -0.726163   0.350866  -2.070   0.0385 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
beyond_opt_nl_d_3 <- glmmTMB(tlf ~  geno_sex  +  density +annual_n+ sex_ratio_y_l + sex_ratio_rg_l + geno_sex*density  + geno_sex*sex_ratio_y_l + (1|group) + (1|year), data = mm_data, family = nbinom2)

summary(beyond_opt_nl_d_3)
##  Family: nbinom2  ( log )
## Formula:          
## tlf ~ geno_sex + density + annual_n + sex_ratio_y_l + sex_ratio_rg_l +  
##     geno_sex * density + geno_sex * sex_ratio_y_l + (1 | group) +  
##     (1 | year)
## Data: mm_data
## 
##      AIC      BIC   logLik deviance df.resid 
##   7182.9   7251.7  -3580.5   7160.9     3836 
## 
## Random effects:
## 
## Conditional model:
##  Groups Name        Variance Std.Dev.
##  group  (Intercept) 0.08274  0.2876  
##  year   (Intercept) 0.05035  0.2244  
## Number of obs: 3847, groups:  group, 96; year, 5
## 
## Dispersion parameter for nbinom2 family (): 0.305 
## 
## Conditional model:
##                           Estimate Std. Error z value Pr(>|z|)    
## (Intercept)             -0.3415979  0.2739969  -1.247    0.212    
## geno_sexM                0.0001470  0.1659561   0.001    0.999    
## density                 -0.0006692  0.0016469  -0.406    0.684    
## annual_n                 0.0003650  0.0002889   1.263    0.206    
## sex_ratio_y_l           -2.0961314  0.4929829  -4.252 2.12e-05 ***
## sex_ratio_rg_l           0.0620671  0.0850120   0.730    0.465    
## geno_sexM:density       -0.0012160  0.0014831  -0.820    0.412    
## geno_sexM:sex_ratio_y_l -0.7408594  0.3513058  -2.109    0.035 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
beyond_opt_nl_d_4 <- glmmTMB(tlf ~  geno_sex  +  density + annual_n+ sex_ratio_y_l + sex_ratio_rg_l   + geno_sex*sex_ratio_y_l + (1|group) + (1|year), data = mm_data, family = nbinom2)

summary(beyond_opt_nl_d_4)
##  Family: nbinom2  ( log )
## Formula:          
## tlf ~ geno_sex + density + annual_n + sex_ratio_y_l + sex_ratio_rg_l +  
##     geno_sex * sex_ratio_y_l + (1 | group) + (1 | year)
## Data: mm_data
## 
##      AIC      BIC   logLik deviance df.resid 
##   7181.6   7244.2  -3580.8   7161.6     3837 
## 
## Random effects:
## 
## Conditional model:
##  Groups Name        Variance Std.Dev.
##  group  (Intercept) 0.08394  0.2897  
##  year   (Intercept) 0.05135  0.2266  
## Number of obs: 3847, groups:  group, 96; year, 5
## 
## Dispersion parameter for nbinom2 family (): 0.305 
## 
## Conditional model:
##                           Estimate Std. Error z value Pr(>|z|)    
## (Intercept)             -0.2913036  0.2691429  -1.082   0.2791    
## geno_sexM               -0.0985418  0.1143347  -0.862   0.3888    
## density                 -0.0013466  0.0014288  -0.943   0.3459    
## annual_n                 0.0003668  0.0002912   1.259   0.2079    
## sex_ratio_y_l           -2.1140060  0.4962656  -4.260 2.05e-05 ***
## sex_ratio_rg_l           0.0710728  0.0845232   0.841   0.4004    
## geno_sexM:sex_ratio_y_l -0.7120134  0.3493821  -2.038   0.0416 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
beyond_opt_nl_d_5 <- glmmTMB(tlf ~  geno_sex  +  density +  sex_ratio_y_l  + annual_n   + geno_sex*sex_ratio_y_l + (1|group) + (1|year), data = mm_data, family = nbinom2)

summary(beyond_opt_nl_d_5)
##  Family: nbinom2  ( log )
## Formula:          
## tlf ~ geno_sex + density + sex_ratio_y_l + annual_n + geno_sex *  
##     sex_ratio_y_l + (1 | group) + (1 | year)
## Data: mm_data
## 
##      AIC      BIC   logLik deviance df.resid 
##   7180.3   7236.6  -3581.1   7162.3     3838 
## 
## Random effects:
## 
## Conditional model:
##  Groups Name        Variance Std.Dev.
##  group  (Intercept) 0.08808  0.2968  
##  year   (Intercept) 0.05106  0.2260  
## Number of obs: 3847, groups:  group, 96; year, 5
## 
## Dispersion parameter for nbinom2 family (): 0.305 
## 
## Conditional model:
##                           Estimate Std. Error z value Pr(>|z|)    
## (Intercept)             -0.2975269  0.2700294  -1.102   0.2705    
## geno_sexM               -0.0896168  0.1139013  -0.787   0.4314    
## density                 -0.0013191  0.0014471  -0.912   0.3620    
## sex_ratio_y_l           -2.0483587  0.4899908  -4.180 2.91e-05 ***
## annual_n                 0.0003647  0.0002919   1.250   0.2115    
## geno_sexM:sex_ratio_y_l -0.6915039  0.3487087  -1.983   0.0474 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
beyond_opt_nl_d_6<- glmmTMB(tlf ~  geno_sex  + sex_ratio_y_l +annual_n  + geno_sex*sex_ratio_y_l + (1|group) + (1|year), data = mm_data, family = nbinom2)

summary(beyond_opt_nl_d_6)
##  Family: nbinom2  ( log )
## Formula:          
## tlf ~ geno_sex + sex_ratio_y_l + annual_n + geno_sex * sex_ratio_y_l +  
##     (1 | group) + (1 | year)
## Data: mm_data
## 
##      AIC      BIC   logLik deviance df.resid 
##   7179.1   7229.1  -3581.5   7163.1     3839 
## 
## Random effects:
## 
## Conditional model:
##  Groups Name        Variance Std.Dev.
##  group  (Intercept) 0.09695  0.3114  
##  year   (Intercept) 0.04242  0.2060  
## Number of obs: 3847, groups:  group, 96; year, 5
## 
## Dispersion parameter for nbinom2 family (): 0.306 
## 
## Conditional model:
##                           Estimate Std. Error z value Pr(>|z|)    
## (Intercept)             -0.3581870  0.2537430  -1.412   0.1581    
## geno_sexM               -0.0891440  0.1140128  -0.782   0.4343    
## sex_ratio_y_l           -2.0201153  0.4636701  -4.357 1.32e-05 ***
## annual_n                 0.0003414  0.0002761   1.236   0.2163    
## geno_sexM:sex_ratio_y_l -0.6925567  0.3491555  -1.984   0.0473 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
beyond_opt_nl_d_7<- glmmTMB(tlf ~  geno_sex  +  sex_ratio_y_l +  geno_sex*sex_ratio_y_l + (1|group) + (1|year), data = mm_data, family = nbinom2)

summary(beyond_opt_nl_d_7)
##  Family: nbinom2  ( log )
## Formula:          
## tlf ~ geno_sex + sex_ratio_y_l + geno_sex * sex_ratio_y_l + (1 |  
##     group) + (1 | year)
## Data: mm_data
## 
##      AIC      BIC   logLik deviance df.resid 
##   7178.3   7222.0  -3582.1   7164.3     3840 
## 
## Random effects:
## 
## Conditional model:
##  Groups Name        Variance Std.Dev.
##  group  (Intercept) 0.09449  0.3074  
##  year   (Intercept) 0.07343  0.2710  
## Number of obs: 3847, groups:  group, 96; year, 5
## 
## Dispersion parameter for nbinom2 family (): 0.306 
## 
## Conditional model:
##                         Estimate Std. Error z value Pr(>|z|)    
## (Intercept)             -0.11614    0.17812  -0.652 0.514369    
## geno_sexM               -0.08811    0.11409  -0.772 0.439922    
## sex_ratio_y_l           -1.75411    0.48904  -3.587 0.000335 ***
## geno_sexM:sex_ratio_y_l -0.69517    0.34903  -1.992 0.046401 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
final_model<- glmmTMB(tlf ~  geno_sex  +  sex_ratio_y_l  + geno_sex*sex_ratio_y_l + (1|group) + (1|year), data = mm_data, family = nbinom2)

summary(final_model)
##  Family: nbinom2  ( log )
## Formula:          
## tlf ~ geno_sex + sex_ratio_y_l + geno_sex * sex_ratio_y_l + (1 |  
##     group) + (1 | year)
## Data: mm_data
## 
##      AIC      BIC   logLik deviance df.resid 
##   7178.3   7222.0  -3582.1   7164.3     3840 
## 
## Random effects:
## 
## Conditional model:
##  Groups Name        Variance Std.Dev.
##  group  (Intercept) 0.09449  0.3074  
##  year   (Intercept) 0.07343  0.2710  
## Number of obs: 3847, groups:  group, 96; year, 5
## 
## Dispersion parameter for nbinom2 family (): 0.306 
## 
## Conditional model:
##                         Estimate Std. Error z value Pr(>|z|)    
## (Intercept)             -0.11614    0.17812  -0.652 0.514369    
## geno_sexM               -0.08811    0.11409  -0.772 0.439922    
## sex_ratio_y_l           -1.75411    0.48904  -3.587 0.000335 ***
## geno_sexM:sex_ratio_y_l -0.69517    0.34903  -1.992 0.046401 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Variable selection procedures agree. sex, sex ratio (annual) and their interaction is significant.

Model validation

#final_model <- glmmTMB(tlf ~  geno_sex  +sex_ratio_y_l + geno_sex*sex_ratio_y_l + (1|group) +  (1|year),  data = mm_data, family = nbinom2)
#plot(final_model)
#qqrplot(final_model)
#rootogram(final_model)
simulateResiduals(final_model, plot = TRUE)

## Object of Class DHARMa with simulated residuals based on 250 simulations with refit = FALSE . See ?DHARMa::simulateResiduals for help. 
##  
## Scaled residual values: 0.4224933 0.6201168 0.1983307 0.5588989 0.179909 0.8125378 0.09290893 0.3052929 0.8028638 0.5425511 0.518053 0.9573324 0.915071 0.8029668 0.837693 0.912789 0.1080119 0.3007513 0.4787766 0.01528406 ...
summary(final_model)
##  Family: nbinom2  ( log )
## Formula:          
## tlf ~ geno_sex + sex_ratio_y_l + geno_sex * sex_ratio_y_l + (1 |  
##     group) + (1 | year)
## Data: mm_data
## 
##      AIC      BIC   logLik deviance df.resid 
##   7178.3   7222.0  -3582.1   7164.3     3840 
## 
## Random effects:
## 
## Conditional model:
##  Groups Name        Variance Std.Dev.
##  group  (Intercept) 0.09449  0.3074  
##  year   (Intercept) 0.07343  0.2710  
## Number of obs: 3847, groups:  group, 96; year, 5
## 
## Dispersion parameter for nbinom2 family (): 0.306 
## 
## Conditional model:
##                         Estimate Std. Error z value Pr(>|z|)    
## (Intercept)             -0.11614    0.17812  -0.652 0.514369    
## geno_sexM               -0.08811    0.11409  -0.772 0.439922    
## sex_ratio_y_l           -1.75411    0.48904  -3.587 0.000335 ***
## geno_sexM:sex_ratio_y_l -0.69517    0.34903  -1.992 0.046401 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Model fits excellently! The qq plot of simulated residuals shows a good fit, there is no evidence of overdispersion, and outliers don’t seem to drive the fit. We also don’t see any pattern to the residuals.

Since we have so few levels of year, it is also useful to make sure we didn’t get a “singular” fit, but the random effect estimates are reasonable (if not a little large)

Effects

final_sum <- summary(final_model)

Random Effects

The among year standard deviation in TLF was 0.324 (log scale), and the among release group standard deviation was 0.283 (log scale)

Interaction

Since the interaction is significant, let’s plot and examine the effects of sex and sex ratio together.

require(effects)

#let's not use the effect package plotting tools and just get the effects

#eff1 <- predictorEffect("sex_ratio_y_l", final_model, focal.levels = seq(-0.4,0.7,by = 0.1))
eff1 <- predictorEffect("sex_ratio_y_l", final_model)
## Warning in Effect.glmmTMB(ans, mod, x.var = 1, xlevels = xlevels, ...):
## overriding variance function for effects: computed variances may be incorrect
effdf <- as.data.frame(eff1)

effdf %<>%
  mutate(sex_ratio = exp(sex_ratio_y_l))

actual_means <- mm_data %>%
  group_by(year, geno_sex, sex_ratio_y) %>%
  summarise(mean_tlf = mean(tlf))
## `summarise()` has grouped output by 'year', 'geno_sex'. You can override using
## the `.groups` argument.
ggplot(data = effdf, aes(x = (sex_ratio), y = fit, color = geno_sex))+ 
  geom_line()+scale_x_continuous(trans = "log", n.breaks = 10) +
  xlab( bquote(atop("Sex Ratio",(N[male]/N[female])))) +
  geom_smooth( aes(ymin = lower, ymax = upper, fill = geno_sex, colour = geno_sex), stat = "identity") +
  theme_bw()+ylab("TLF")+coord_cartesian(ylim = c(0, 3)) + scale_color_manual(labels = c("Female", "Male"), name = "Sex", values = c("#228833", "#AA3377")) + scale_fill_manual(labels = c("Female", "Male"), name = "Sex", values = c("#228833", "#AA3377")) +
  geom_rug(data = mm_data, aes(x = sex_ratio_y, y = NULL, color = NULL)) + 
  geom_point(data = actual_means, aes(x = sex_ratio_y, y = mean_tlf, color = geno_sex), alpha = 0.8, shape = 24, size = 2)

The plot above shows the predicted TLF (lines and 95% confidence intervals) from the final mixed model fit, and the mean TLF from the empirical data (triangles) against sex ratio in a given year for both male and female Chinook salmon outplanted or reintroduced above Detroit dam from 2011 to 2015.

Other Types

We benefit form having somewhat indepedent data against which to test our predictions in the form of fish from below Big Cliff. Let’s quickly assess if there is a relationship between TLF and annual sex ratios for carcasses and reintros below.

mm_data_below <-  parents %>%
  filter(type %in% c("reintro"), year < 2017) %>%
  select(date, geno_sex, location, tlf, year, type) %>%
  mutate(jday = as.numeric(format(date, "%j"))) %>% #julian day in this case: days since the first day of the year
  mutate(location = case_when(location == "dry cr" ~ "dry creek",
                              TRUE ~ as.character(location))) %>%
  mutate(location = as.factor(location)) %>%
  mutate(jday_c = scale(jday, scale = F),#center the julian day to help with convergence
         geno_sex = as.factor(geno_sex),
         location = as.factor(location),
         year = as.factor(year),
         group = as.factor(paste(date, location)))


# lets add density
dens <- mm_data_below %>%
 group_by(jday, location, year) %>%
  summarise(density = n())
## `summarise()` has grouped output by 'jday', 'location'. You can override using
## the `.groups` argument.
mm_data_below %<>%
  left_join(dens)
## Joining, by = c("location", "year", "jday")
# lets add overall size of release in a year
dens <- mm_data_below %>%
 group_by( year) %>%
  summarise(annual_n = n())

mm_data_below %<>%
  left_join(dens)
## Joining, by = "year"
# lets add sex ratio
#build release group sex ratio variable
f <- mm_data_below %>% 
  filter(geno_sex == "F") %>%
  group_by(location, jday, year) %>% 
  summarise(n_female_rg = n()) 
## `summarise()` has grouped output by 'location', 'jday'. You can override using
## the `.groups` argument.
mm_data_below %<>%
  left_join(f)
## Joining, by = c("location", "year", "jday")
m <- mm_data_below %>% 
  filter(geno_sex == "M") %>%
  group_by(location, jday, year) %>% 
  summarise(n_male_rg = n()) 
## `summarise()` has grouped output by 'location', 'jday'. You can override using
## the `.groups` argument.
mm_data_below %<>%
  left_join(m) %>%
  mutate(sex_ratio_rg = n_male_rg/n_female_rg)
## Joining, by = c("location", "year", "jday")
# maybe the release groups all mix thoroughly spawn together and we should fit sex ratio at the level of year
f <- mm_data_below %>% 
  filter(geno_sex == "F") %>%
  group_by( year) %>% 
  summarise(n_female_y = n()) 

mm_data_below %<>%
  left_join(f)
## Joining, by = "year"
m <- mm_data_below %>% 
  filter(geno_sex == "M") %>%
  group_by( year) %>% 
  summarise(n_male_y = n()) 

mm_data_below %<>%
  left_join(m) %>%
  mutate(sex_ratio_y = n_male_y/n_female_y)
## Joining, by = "year"
mm_data_below %<>%
  mutate(sex_ratio_rg_l = log(sex_ratio_rg),
         sex_ratio_y_l = log(sex_ratio_y))
below_tlf_means <- mm_data_below %>%
  group_by(geno_sex, sex_ratio_y_l, year) %>%
  summarise(mean_tlf = mean(tlf), sd_tlf = sd(tlf))
## `summarise()` has grouped output by 'geno_sex', 'sex_ratio_y_l'. You can
## override using the `.groups` argument.
ggplot(data = below_tlf_means)+geom_point(aes(sex_ratio_y_l, mean_tlf, color = geno_sex ))+geom_smooth(aes(sex_ratio_y_l, mean_tlf, color = geno_sex ), method = "lm")
## `geom_smooth()` using formula 'y ~ x'

below_glmm <- glmmTMB(tlf ~  geno_sex  + sex_ratio_y_l+ sex_ratio_y_l*geno_sex , data = mm_data_below, family=nbinom2)
summary(below_glmm)
##  Family: nbinom2  ( log )
## Formula:          tlf ~ geno_sex + sex_ratio_y_l + sex_ratio_y_l * geno_sex
## Data: mm_data_below
## 
##      AIC      BIC   logLik deviance df.resid 
##   2436.1   2464.1  -1213.1   2426.1     1971 
## 
## 
## Dispersion parameter for nbinom2 family (): 0.281 
## 
## Conditional model:
##                         Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              -0.6987     0.1843  -3.792 0.000149 ***
## geno_sexM                -0.2638     0.2522  -1.046 0.295678    
## sex_ratio_y_l            -0.7831     0.3606  -2.171 0.029897 *  
## geno_sexM:sex_ratio_y_l  -0.4372     0.4817  -0.908 0.364108    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
drop1(below_glmm, test = "Chisq")
#interaction not significant, remove and refit
below_glmm <- glmmTMB(tlf ~  geno_sex  + sex_ratio_y_l , data = mm_data_below, family=nbinom2)
summary(below_glmm)
##  Family: nbinom2  ( log )
## Formula:          tlf ~ geno_sex + sex_ratio_y_l
## Data: mm_data_below
## 
##      AIC      BIC   logLik deviance df.resid 
##   2434.9   2457.3  -1213.5   2426.9     1972 
## 
## 
## Dispersion parameter for nbinom2 family (): 0.28 
## 
## Conditional model:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -0.5901     0.1409  -4.187 2.83e-05 ***
## geno_sexM      -0.4630     0.1250  -3.703 0.000213 ***
## sex_ratio_y_l  -1.0290     0.2390  -4.305 1.67e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
drop1(below_glmm, test = "Chisq")
plot(predictorEffect("sex_ratio_y_l", below_glmm), main = "GLM: TLF ~ sex + annual sex ratio\nNOR Below Big Cliff",  axes=list(x=list(sex_ratio_y_l=list(lab="Log Sex ratio (males / females)"))))
## Warning in Effect.glmmTMB(ans, mod, x.var = 1, xlevels = xlevels, ...):
## overriding variance function for effects: computed variances may be incorrect

simulateResiduals(below_glmm, plot = TRUE)

## Object of Class DHARMa with simulated residuals based on 250 simulations with refit = FALSE . See ?DHARMa::simulateResiduals for help. 
##  
## Scaled residual values: 0.7101547 0.183623 0.9866914 0.9831289 0.1633995 0.8388224 0.04729095 0.9327624 0.5937675 0.6165492 0.9630993 0.1271758 0.9624872 0.8547619 0.2597211 0.01486824 0.7720559 0.7978305 0.8594588 0.9221622 ...
effdf <- as.data.frame(predictorEffect("sex_ratio_y_l", below_glmm), main = "GLM: TLF ~ sex + annual sex ratio\nNOR Below Big Cliff",  axes=list(x=list(sex_ratio_y_l=list(lab="Log Sex ratio (males / females)"))))
## Warning in Effect.glmmTMB(ans, mod, x.var = 1, xlevels = xlevels, ...):
## overriding variance function for effects: computed variances may be incorrect
effdf %<>%
  mutate(sex_ratio_y = exp(sex_ratio_y_l))

below_tlf_means2 <- mm_data_below %>%
  group_by( sex_ratio_y_l, year) %>%
  summarise(mean_tlf = mean(tlf), sd_tlf = sd(tlf))
## `summarise()` has grouped output by 'sex_ratio_y_l'. You can override using the
## `.groups` argument.
ggplot(data = effdf, aes(x = (sex_ratio_y), y = fit)) + geom_line()+scale_x_continuous(trans = "log", n.breaks = 10) +
  xlab( bquote(atop("Sex Ratio",(N[male]/N[female])))) +
  geom_smooth( aes(ymin = lower, ymax = upper), stat = "identity") +
  theme_bw()+ylab("Fitness") + 
  geom_rug(data = below_tlf_means2, aes(x = exp(sex_ratio_y_l), y = NULL))

  #geom_point(data = below_tlf_means2, aes(x = exp(sex_ratio_y_l), y = mean_tlf), alpha = 0.8, shape = 24, size = 2)

Similar effect both above and below Detroit. Is this due to other variables nested within year? Let’s check if sex ratios were correlated?

a <- mm_data %>% count(sex_ratio_y, year)
b <- mm_data_below %>% count(sex_ratio_y, year) 

cor <- bind_rows(a,b, .id = "above_below") %>%
  mutate(above_below = case_when(above_below == "1" ~ "above",
                                 above_below == "2" ~ "below"))

ggplot(data = cor)+geom_point(aes(x = year, y = sex_ratio_y, color = above_below))

Nope looks good.

include 2016

mm_data <-  parents %>%
  filter(type %in% c("outplant", "reintro_above"), year <2017) %>%
  select(date, geno_sex, location, tlf, year) %>%

  mutate(jday = as.numeric(format(date, "%j"))) %>% #julian day in this case: days since the first day of the year
  mutate(location = case_when(location == "dry cr" ~ "dry creek",
                              TRUE ~ as.character(location))) %>%
  mutate(location = as.factor(location)) %>%
  mutate(jday_c = scale(jday, scale = F),#center the julian day to help with convergence
         geno_sex = as.factor(geno_sex),
         location = as.factor(location),
         year = as.factor(year),
         group = as.factor(paste(date, location)))


# lets add density
dens <- mm_data %>%
 group_by(jday, location, year) %>%
  summarise(density = n())
## `summarise()` has grouped output by 'jday', 'location'. You can override using
## the `.groups` argument.
mm_data %<>%
  left_join(dens)
## Joining, by = c("location", "year", "jday")
# lets add overall size of release in a year
dens <- mm_data %>%
 group_by( year) %>%
  summarise(annual_n = n())

mm_data %<>%
  left_join(dens)
## Joining, by = "year"
# lets add sex ratio
#build release group sex ratio variable
f <- mm_data %>% 
  filter(geno_sex == "F") %>%
  group_by(location, jday, year) %>% 
  summarise(n_female_rg = n()) 
## `summarise()` has grouped output by 'location', 'jday'. You can override using
## the `.groups` argument.
mm_data %<>%
  left_join(f)
## Joining, by = c("location", "year", "jday")
m <- mm_data %>% 
  filter(geno_sex == "M") %>%
  group_by(location, jday, year) %>% 
  summarise(n_male_rg = n()) 
## `summarise()` has grouped output by 'location', 'jday'. You can override using
## the `.groups` argument.
mm_data %<>%
  left_join(m) %>%
  mutate(sex_ratio_rg = n_male_rg/n_female_rg)
## Joining, by = c("location", "year", "jday")
# maybe the release groups all mix thoroughly spawn together and we should fit sex ratio at the level of year
f <- mm_data %>% 
  filter(geno_sex == "F") %>%
  group_by( year) %>% 
  summarise(n_female_y = n()) 

mm_data %<>%
  left_join(f)
## Joining, by = "year"
m <- mm_data %>% 
  filter(geno_sex == "M") %>%
  group_by( year) %>% 
  summarise(n_male_y = n()) 

mm_data %<>%
  left_join(m) %>%
  mutate(sex_ratio_y = n_male_y/n_female_y)
## Joining, by = "year"
mm_data %<>%
  mutate(sex_ratio_rg_l = log(sex_ratio_rg),
         sex_ratio_y_l = log(sex_ratio_y))
mm_data %<>%
  select(-c(jday, jday_c))

mm_data %<>%
  select(-date)

mm_data %<>%
  drop_na()
mm_beyond_opt <- glmmTMB(tlf ~  geno_sex  +  poly(density,2)+ annual_n + geno_sex*poly(density,2)+poly(sex_ratio_y_l,2)+ poly(sex_ratio_rg_l,2) + geno_sex*poly(sex_ratio_rg_l,2) + geno_sex*poly(sex_ratio_y_l,2) + (1|group) + (1|year), data = mm_data, family = nbinom2)

summary(mm_beyond_opt)
##  Family: nbinom2  ( log )
## Formula:          
## tlf ~ geno_sex + poly(density, 2) + annual_n + geno_sex * poly(density,  
##     2) + poly(sex_ratio_y_l, 2) + poly(sex_ratio_rg_l, 2) + geno_sex *  
##     poly(sex_ratio_rg_l, 2) + geno_sex * poly(sex_ratio_y_l,  
##     2) + (1 | group) + (1 | year)
## Data: mm_data
## 
##      AIC      BIC   logLik deviance df.resid 
##  11752.2  11870.1  -5858.1  11716.2     5139 
## 
## Random effects:
## 
## Conditional model:
##  Groups Name        Variance Std.Dev.
##  group  (Intercept) 0.08832  0.2972  
##  year   (Intercept) 0.03651  0.1911  
## Number of obs: 5157, groups:  group, 98; year, 6
## 
## Dispersion parameter for nbinom2 family (): 0.407 
## 
## Conditional model:
##                                      Estimate Std. Error z value Pr(>|z|)   
## (Intercept)                        -6.167e-01  2.617e-01  -2.357  0.01844 * 
## geno_sexM                          -8.350e-02  6.453e-02  -1.294  0.19566   
## poly(density, 2)1                  -3.322e+00  1.569e+01  -0.212  0.83235   
## poly(density, 2)2                   2.212e+00  5.511e+00   0.401  0.68815   
## annual_n                            2.039e-04  2.250e-04   0.906  0.36481   
## poly(sex_ratio_y_l, 2)1            -3.981e+01  1.305e+01  -3.050  0.00229 **
## poly(sex_ratio_y_l, 2)2            -1.722e+01  9.227e+00  -1.866  0.06204 . 
## poly(sex_ratio_rg_l, 2)1            4.314e-01  6.063e+00   0.071  0.94328   
## poly(sex_ratio_rg_l, 2)2           -2.859e+00  4.508e+00  -0.634  0.52589   
## geno_sexM:poly(density, 2)1         1.025e+01  1.236e+01   0.829  0.40690   
## geno_sexM:poly(density, 2)2         5.820e+00  4.980e+00   1.169  0.24255   
## geno_sexM:poly(sex_ratio_rg_l, 2)1  1.192e+01  8.194e+00   1.454  0.14587   
## geno_sexM:poly(sex_ratio_rg_l, 2)2 -7.539e+00  7.287e+00  -1.034  0.30090   
## geno_sexM:poly(sex_ratio_y_l, 2)1  -2.874e+01  1.151e+01  -2.498  0.01249 * 
## geno_sexM:poly(sex_ratio_y_l, 2)2   5.438e-01  7.639e+00   0.071  0.94325   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Effective Number of Breeders

Calculated Nb using LD method from NeEstimator v2.1. Used GUI, so only logging the parameters used here, not calculating the values.

Parameters:
S+ option (exclude singletons)
CIs: 95% confidence intervals from jacknife re-sampling method
Data: Data for a given year is the assigned offspring of that year

Data Prep

Outplants

2011

Let’s create the input data for NeEstimator. We’ll take advantage of a wrapper function from adegenet to output a genepop file all assigned offspring for each parent year.

# Here we filter the genotype data to get only offspring from 2011 cohor
offspring_of_2011 <- pedigree_meta %>%
  filter(mother_year == 2011 | father_year ==2011) %>%
  filter(mother_type  == "outplant" | father_type == "outplant") %>%
  pull(offspring_sample_id)

gts_off_of_2011 <- full_meta %>%
  filter(sample_id %in% offspring_of_2011) %>%
  select(sample_id, starts_with(c("Ot", "Ogo", "SSa"))) %>% #grab the genotypes
  mutate(across(.cols = everything(), ~na_if(., "0")))

#now we put bth alleles into a single column for each locus
gts_off_of_2011 %<>%
  gather(key = var, value = value, -sample_id) %>%
  mutate(var = str_extract(var, "\\d+") %>% as.numeric()) %>% 
  group_by(sample_id, var) %>%
  summarise(combined = paste(value, collapse = "")) %>% 
  spread(key = var, value = combined) 

# add a dummy pop variable for conversion, fix NAs
gts_off_of_2011 %<>%
  add_column(pop = "p") %>%
  relocate(sample_id, pop) %>%
  mutate(across(.cols = everything(), ~str_replace(., "NANA", "000000")))

#write_genepop_zlr(loci = gts_off_of_2011[,3:ncol(gts_off_of_2011)],pops = gts_off_of_2011$pop,ind.ids = gts_off_of_2011$sample_id,folder = "neestimator/outplants/",filename ="genepop_2011.txt",ncode = 3,diploid = T)

Neestimator results are not formatted to be easily parsed by machine. Instead, wrote results manually to a spreadsheet. We’ll import it in the end.

2012

# Here we filter the genotype data to get only offspring from 2011 cohor
offspring_of_2012 <- pedigree_meta %>%
  filter(mother_year == 2012 | father_year ==2012) %>%
  filter(mother_type  == "outplant" | father_type == "outplant") %>%
  pull(offspring_sample_id)

gts_off_of_2012 <- full_meta %>%
  filter(sample_id %in% offspring_of_2012) %>%
  select(sample_id, starts_with(c("Ot", "Ogo", "SSa"))) %>% #grab the genotypes
  mutate(across(.cols = everything(), ~na_if(., "0")))

#now we put bth alleles into a single column for each locus
gts_off_of_2012 %<>%
  gather(key = var, value = value, -sample_id) %>%
  mutate(var = str_extract(var, "\\d+") %>% as.numeric()) %>% 
  group_by(sample_id, var) %>%
  summarise(combined = paste(value, collapse = "")) %>% 
  spread(key = var, value = combined) 

# add a dummy pop variable for conversion, fix NAs
gts_off_of_2012 %<>%
  add_column(pop = "p") %>%
  relocate(sample_id, pop) %>%
  mutate(across(.cols = everything(), ~str_replace(., "NANA", "000000")))

#write_genepop_zlr(loci = gts_off_of_2012[,3:ncol(gts_off_of_2012)],pops = gts_off_of_2012$pop,ind.ids = gts_off_of_2012$sample_id,folder = "neestimator/outplants/",filename ="genepop_2012.txt",ncode = 3,diploid = T)

2013

# Here we filter the genotype data to get only offspring from 2011 cohor
offspring_of_2013 <- pedigree_meta %>%
  filter(mother_year == 2013 | father_year ==2013) %>%
  filter(mother_type  == "outplant" | father_type == "outplant") %>%
  pull(offspring_sample_id)

gts_off_of_2013 <- full_meta %>%
  filter(sample_id %in% offspring_of_2013) %>%
  select(sample_id, starts_with(c("Ot", "Ogo", "SSa"))) %>% #grab the genotypes
  mutate(across(.cols = everything(), ~na_if(., "0")))

#now we put bth alleles into a single column for each locus
gts_off_of_2013 %<>%
  gather(key = var, value = value, -sample_id) %>%
  mutate(var = str_extract(var, "\\d+") %>% as.numeric()) %>% 
  group_by(sample_id, var) %>%
  summarise(combined = paste(value, collapse = "")) %>% 
  spread(key = var, value = combined) 

# add a dummy pop variable for conversion, fix NAs
gts_off_of_2013 %<>%
  add_column(pop = "p") %>%
  relocate(sample_id, pop) %>%
  mutate(across(.cols = everything(), ~str_replace(., "NANA", "000000")))

#write_genepop_zlr(loci = gts_off_of_2013[,3:ncol(gts_off_of_2013)],pops = gts_off_of_2013$pop,ind.ids = gts_off_of_2013$sample_id,folder = "neestimator/outplants/",filename ="genepop_2013.txt",ncode = 3,diploid = T)
# Here we filter the genotype data to get only offspring from 2011 cohor
offspring_of_2014 <- pedigree_meta %>%
  filter(mother_year == 2014 | father_year ==2014) %>%
  filter(mother_type  == "outplant" | father_type == "outplant") %>%
  pull(offspring_sample_id)

gts_off_of_2014 <- full_meta %>%
  filter(sample_id %in% offspring_of_2014) %>%
  select(sample_id, starts_with(c("Ot", "Ogo", "SSa"))) %>% #grab the genotypes
  mutate(across(.cols = everything(), ~na_if(., "0")))

#now we put bth alleles into a single column for each locus
gts_off_of_2014 %<>%
  gather(key = var, value = value, -sample_id) %>%
  mutate(var = str_extract(var, "\\d+") %>% as.numeric()) %>% 
  group_by(sample_id, var) %>%
  summarise(combined = paste(value, collapse = "")) %>% 
  spread(key = var, value = combined) 

# add a dummy pop variable for conversion, fix NAs
gts_off_of_2014 %<>%
  add_column(pop = "p") %>%
  relocate(sample_id, pop) %>%
  mutate(across(.cols = everything(), ~str_replace(., "NANA", "000000")))

#write_genepop_zlr(loci = gts_off_of_2014[,3:ncol(gts_off_of_2014)],pops = gts_off_of_2014$pop,ind.ids = gts_off_of_2014$sample_id,folder = "neestimator/outplants/",filename ="genepop_2014.txt",ncode = 3,diploid = T)

2015

# Here we filter the genotype data to get only offspring from 2011 cohor
offspring_of_2015 <- pedigree_meta %>%
  filter(mother_year == 2015 | father_year ==2015) %>%
  filter(mother_type  == "outplant" | father_type == "outplant") %>%
  pull(offspring_sample_id)

gts_off_of_2015 <- full_meta %>%
  filter(sample_id %in% offspring_of_2015) %>%
  select(sample_id, starts_with(c("Ot", "Ogo", "SSa"))) %>% #grab the genotypes
  mutate(across(.cols = everything(), ~na_if(., "0")))

#now we put bth alleles into a single column for each locus
gts_off_of_2015 %<>%
  gather(key = var, value = value, -sample_id) %>%
  mutate(var = str_extract(var, "\\d+") %>% as.numeric()) %>% 
  group_by(sample_id, var) %>%
  summarise(combined = paste(value, collapse = "")) %>% 
  spread(key = var, value = combined) 

# add a dummy pop variable for conversion, fix NAs
gts_off_of_2015 %<>%
  add_column(pop = "p") %>%
  relocate(sample_id, pop) %>%
  mutate(across(.cols = everything(), ~str_replace(., "NANA", "000000")))

#write_genepop_zlr(loci = gts_off_of_2015[,3:ncol(gts_off_of_2015)],pops = gts_off_of_2015$pop,ind.ids = gts_off_of_2015$sample_id,folder = "neestimator/outplants/",filename ="genepop_2015.txt",ncode = 3,diploid = T)

Reintros

Reintro Above (2015)

# Here we filter the genotype data to get only offspring from 2011 cohor
offspring_of_2015 <- pedigree_meta %>%
  filter(mother_year == 2015 | father_year ==2015) %>%
  filter(mother_type  == "reintro_above" | father_type == "reintro_above") %>%
  pull(offspring_sample_id)

gts_off_of_2015 <- full_meta %>%
  filter(sample_id %in% offspring_of_2015) %>%
  select(sample_id, starts_with(c("Ot", "Ogo", "SSa"))) %>% #grab the genotypes
  mutate(across(.cols = everything(), ~na_if(., "0")))

#now we put bth alleles into a single column for each locus
gts_off_of_2015 %<>%
  gather(key = var, value = value, -sample_id) %>%
  mutate(var = str_extract(var, "\\d+") %>% as.numeric()) %>% 
  group_by(sample_id, var) %>%
  summarise(combined = paste(value, collapse = "")) %>% 
  spread(key = var, value = combined) 

# add a dummy pop variable for conversion, fix NAs
gts_off_of_2015 %<>%
  add_column(pop = "p") %>%
  relocate(sample_id, pop) %>%
  mutate(across(.cols = everything(), ~str_replace(., "NANA", "000000")))

#write_genepop_zlr(loci = gts_off_of_2015[,3:ncol(gts_off_of_2015)],pops = gts_off_of_2015$pop,ind.ids = gts_off_of_2015$sample_id,folder = "neestimator/reintros/",filename ="genepop_2015_above.txt",ncode = 3,diploid = T)

Reintro Below (2013)

# Here we filter the genotype data to get only offspring from 2011 cohor
offspring_of_2013 <- pedigree_meta %>%
  filter(mother_year == 2013 | father_year ==2013) %>%
  filter(mother_type  == "reintro" | father_type == "reintro") %>%
  pull(offspring_sample_id)

gts_off_of_2013 <- full_meta %>%
  filter(sample_id %in% offspring_of_2013) %>%
  select(sample_id, starts_with(c("Ot", "Ogo", "SSa"))) %>% #grab the genotypes
  mutate(across(.cols = everything(), ~na_if(., "0")))

#now we put bth alleles into a single column for each locus
gts_off_of_2013 %<>%
  gather(key = var, value = value, -sample_id) %>%
  mutate(var = str_extract(var, "\\d+") %>% as.numeric()) %>% 
  group_by(sample_id, var) %>%
  summarise(combined = paste(value, collapse = "")) %>% 
  spread(key = var, value = combined) 

# add a dummy pop variable for conversion, fix NAs
gts_off_of_2013 %<>%
  add_column(pop = "p") %>%
  relocate(sample_id, pop) %>%
  mutate(across(.cols = everything(), ~str_replace(., "NANA", "000000")))

#write_genepop_zlr(loci = gts_off_of_2013[,3:ncol(gts_off_of_2013)],pops = gts_off_of_2013$pop,ind.ids = gts_off_of_2013$sample_id,folder = "neestimator/reintros/",filename ="genepop_2013.txt",ncode = 3,diploid = T)

Reintro Below (2014)

# Here we filter the genotype data to get only offspring from 2011 cohor
offspring_of_2014 <- pedigree_meta %>%
  filter(mother_year == 2014 | father_year ==2014) %>%
  filter(mother_type  == "reintro" | father_type == "reintro") %>%
  pull(offspring_sample_id)

gts_off_of_2014 <- full_meta %>%
  filter(sample_id %in% offspring_of_2014) %>%
  select(sample_id, starts_with(c("Ot", "Ogo", "SSa"))) %>% #grab the genotypes
  mutate(across(.cols = everything(), ~na_if(., "0")))

#now we put bth alleles into a single column for each locus
gts_off_of_2014 %<>%
  gather(key = var, value = value, -sample_id) %>%
  mutate(var = str_extract(var, "\\d+") %>% as.numeric()) %>% 
  group_by(sample_id, var) %>%
  summarise(combined = paste(value, collapse = "")) %>% 
  spread(key = var, value = combined) 

# add a dummy pop variable for conversion, fix NAs
gts_off_of_2014 %<>%
  add_column(pop = "p") %>%
  relocate(sample_id, pop) %>%
  mutate(across(.cols = everything(), ~str_replace(., "NANA", "000000")))

#write_genepop_zlr(loci = gts_off_of_2014[,3:ncol(gts_off_of_2014)],pops = gts_off_of_2014$pop,ind.ids = gts_off_of_2014$sample_id,folder = "neestimator/reintros/",filename ="genepop_2014.txt",ncode = 3,diploid = T)

Reintro Below (2015)

# Here we filter the genotype data to get only offspring from 2011 cohor
offspring_of_2015 <- pedigree_meta %>%
  filter(mother_year == 2015 | father_year ==2015) %>%
  filter(mother_type  == "reintro" | father_type == "reintro") %>%
  pull(offspring_sample_id)

gts_off_of_2015 <- full_meta %>%
  filter(sample_id %in% offspring_of_2015) %>%
  select(sample_id, starts_with(c("Ot", "Ogo", "SSa"))) %>% #grab the genotypes
  mutate(across(.cols = everything(), ~na_if(., "0")))

#now we put bth alleles into a single column for each locus
gts_off_of_2015 %<>%
  gather(key = var, value = value, -sample_id) %>%
  mutate(var = str_extract(var, "\\d+") %>% as.numeric()) %>% 
  group_by(sample_id, var) %>%
  summarise(combined = paste(value, collapse = "")) %>% 
  spread(key = var, value = combined) 

# add a dummy pop variable for conversion, fix NAs
gts_off_of_2015 %<>%
  add_column(pop = "p") %>%
  relocate(sample_id, pop) %>%
  mutate(across(.cols = everything(), ~str_replace(., "NANA", "000000")))

#write_genepop_zlr(loci = gts_off_of_2015[,3:ncol(gts_off_of_2015)],pops = gts_off_of_2015$pop,ind.ids = gts_off_of_2015$sample_id,folder = "neestimator/reintros/",filename ="genepop_2015.txt",ncode = 3,diploid = T)

Combined Above

Since the outplants and reintros above the dam seem to be freely spawning together. We should really estimate Nb for these groups together.

Results

nb <- readxl::read_xlsx("neestimator/Nb_results.xlsx", sheet = 1)


#let's add the counts of number of candidate parents, number of successful parents estimated from the pedigree

#get counts total candidates

nb <- meta_data %>%
  mutate(year = as.numeric(year)) %>%
  count(year, type) %>%
  right_join(nb) %>%
  rename(n_candidate_parents = n)



# get basic counts of successful parents (candidate parents in row with at least one offspring)

# kable(pedigree_meta %>%
#   mutate(parent_year = coalesce(mother_year, father_year)) %>%
#   select( parent_year, father_id = father, mother_id = mother, father_type, mother_type) %>%
#   pivot_longer(cols = c(father_id, father_type, mother_id, mother_type), names_to = c("parent_sex", ".value"), names_sep = "_") %>%
#   filter(id != "none") %>%
#   distinct(id, .keep_all = TRUE) %>%
#   count(parent_year, type), align = "c", caption = "number of parents with at least one assigned offspring, parents considered individually") %>%
#   kable_classic(full_width = F, html_font = "Cambria")

#also add these results to the results table
nb <- pedigree_meta %>%
  mutate(parent_year = coalesce(mother_year, father_year)) %>%
  select( parent_year, father_id = father, mother_id = mother, father_type, mother_type) %>%
  pivot_longer(cols = c(father_id, father_type, mother_id, mother_type), names_to = c("parent_sex", ".value"), names_sep = "_") %>%
  filter(id != "none") %>%
  distinct(id, .keep_all = TRUE) %>%
  count(parent_year, type) %>%
  rename(year = parent_year) %>%
  mutate(year = as.numeric(year)) %>%
  right_join(nb) %>%
  rename(n_successful_parents = n)

  

# now present nb

nb %<>%
  mutate(nb_n_ratio = Nb/n_successful_parents) %>%
  relocate(year, type, Nb, ci_lower, ci_higher)

kable(nb, align = "c", caption = "Nb results table") %>% kable_classic(full_width = F, html_font = "Cambria") %>%
  footnote(number = c("n_succesful_parents: number of parents with at least one offspring in the final pedigree", "n_candidate_parents: total number of parents included as candidates in the pedigree analysis", "nb_n_ratio: (estimated Nb)/(n_successful_parents)"))
Nb results table
year type Nb ci_lower ci_higher n_successful_parents n_candidate_parents n_offspring nb_n_ratio
2011 outplant 57.1 42.4 80.6 50 149 94 1.142000
2012 outplant 113.0 91.1 143.5 102 258 174 1.107843
2013 outplant 317.5 251.7 418.7 248 1125 242 1.280242
2013 reintro 162.0 103.2 323.8 97 554 91 1.670103
2014 outplant 145.4 89.7 312.1 104 861 87 1.398077
2014 reintro 230.6 111.9 4527.3 60 754 50 3.843333
2015 outplant 315.3 266.2 379.2 308 1042 498 1.023701
2015 reintro 80.1 56.8 123.5 49 148 83 1.634694
2015 reintro_above 217.1 183.7 259.9 167 431 396 1.300000
2015 combined_above 370.4 321.6 430.2 NA NA 717 NA
1 n_succesful_parents: number of parents with at least one offspring in the final pedigree
2 n_candidate_parents: total number of parents included as candidates in the pedigree analysis
3 nb_n_ratio: (estimated Nb)/(n_successful_parents)

Discussion

These are some stray thoughts I have, don’t feel the need to dive into the weeds here if pressed for time.

Issues
Note that there are two issues that are a little confusing here and may warrant resolving:

Which Cohorts: Outplant and Reintros OR Above and Below?
We include all offspring that have at least one assigned offspring to a parent cohort as part of that parent cohort’s dataset used to estimate Nb. There are two parent cohorts under consideration in the analysis: reintros, and outplants (3 if you also include rentros above Detroit). However there are only two biological parent cohorts, above Detroit and below Big Cliff. Carcass samples appear in the pedigree as parents with both outplants (above) and reintros (below), so they can be part of either cohort, either because the carcass was sampled above the dam (n = 15) or because the carcass was sampled below the dam (n = 427), and spawned below the dam, or spawned above the dam but passed over/through the dam after spawning.
As a consequence, our understanding of the census size of successful spawners in a cohort (as estimated by the number of parents in a given year/group with assigned offspring) is biased downwards, because any carcass samples from fish that spawned successfully as a member of this cohort are not counted. Since we can’t say for sure if a carcass was part of the above or below dam cohort unless it’s spawning partner is assigned (e.g it is part of a parent-offspring trio with a non-carcass sample parent), we can’t simply add all carcass parentages to the total. However, we can count a carcass sample as part of the above or below dam cohort if it is a member of a trio. In the results below I present both numbers, we can choose which to include in the report down the road.

Single Parents
A second way to improve our count of prents in a cohort would be to infer a second parent for single parent matches. If a offspring is assigned to a single parent, we can assume there was a second parent and the assignment was not made because (i) the parent was not included in our set of candidate parents because it was not sampled, (ii) the parent was sampled, but not included in our set of candidate parents because ithad insufficient data to be included, or (iii) the second parent was in the datset but incorrectly excluded due to assignment criteria.

In the case of (i) or (ii) we could simply add a second parent to the counts and imporve our estimate of the number of parents in a cohort. However the possibility of (iii) presents a problem. We might double count an individual, if it is excluded from some parentages, but not others. Also, we may over estimate the number of parents if an inferred excluded parent contributed to multiple parentages. I do not attempt to infer the number of parents accounting for single parentage because of these multiple issues.

Nb bigger than N_successful / synthesis of issues above

Throughout the Nb results and discussion in the text we consider why Nb is sometimes larger or similar to the number of parents that produced at least one offspring in the pedigree. It should probably always be lower, substantially so given the variance year to year in population size, skewed sex ratios and other things going on. So what’s going on here?

Let’s ignore the single parent issue for now.

# get counts of different types of parent pairs
kable(pedigree_meta %>%
  mutate(parent_year = coalesce(mother_year, father_year)) %>%
  select(parent_type, parent_year, father, mother) %>%
  pivot_longer(cols = c(father, mother), names_to = "parent_sex", values_to = "parent_id" ) %>%
  filter(parent_id != "none") %>%
  distinct(parent_id, .keep_all = TRUE) %>% 
  count(parent_year, parent_type), align = "c", caption = "number of parents with at least one assigned offspring, types for both parents presented if trio, see note about error in this table") %>%
  kable_classic(full_width = F, html_font = "Cambria")
number of parents with at least one assigned offspring, types for both parents presented if trio, see note about error in this table
parent_year parent_type n
2011 carcass 40
2011 outplant 50
2012 carcass 19
2012 outplant 102
2013 carcass 2
2013 carcass/reintro 2
2013 outplant 248
2013 reintro 96
2014 carcass 2
2014 outplant 104
2014 reintro 60
2015 carcass 2
2015 carcass_above 1
2015 carcass_above/outplant 1
2015 outplant 239
2015 outplant/reintro_above 125
2015 reintro 48
2015 reintro_above 88
2015 reintro_above/carcass_above 2
2015 reintro_above/outplant 22
2015 reintro/carcass 2
2016 carcass 5
2016 outplant 695
2016 reintro 122
2017 outplant 16
2017 reintro 19
# note there's a problem with this table, we filter duplicates, so when a single individual has spawning partners of multiple types, the count by type is only counted once. For example, a 2015 outplant individual that spawns with both other outplants and reintros is either included in the outplants total or the outplant/reintro total depdending on which is the first (by row order) in the pedigree dataframe. Figuring out the right way to add these up will take some time, so skipping for now unless we really want to include these numbers in the report.

Consider the table above, it suggests which number to present as the number of parents from the pedigree is not straightforward. In the n_successful parents column in the Nb results table we consider each parent individually, in the table above we consider parent types for both parents in a trio, which is the more informative value to present, because it tells us about the actual cohort of individuals that spawned together. For example, for the 2011 outplant parent cohort, should the number be 51 to reflect only outplants, or 52 to reflect that we know that one carcass individual produced offspring in the pedigree with one outplants?

Now that we’ve established this, let’s discuss why in some groups (all reintro below), the estimated Nb is greater than the number of successful parents in the pedigree.

Negative assortative mating?
Maybe, but doubtful.

Our assignment criteria are too stringent given the amount of mis-genotypting and undersampling of candidate parents?
This might be the case, but we can’t do much about this right now. If true, it is a bit troubling that the problem is more severe in one parent type (reintros below) than others.

The cohort is artificially small because we split by type rather than spawning location?
YES, we can positively assess this. In 2015, there are at least 151 parents above the dam that spawned with a different type than themselve (e.g. reintro above/outplant). To accurately reflect the number of parents in the above dam cohort with at least a single offspring in the pedigree we need to use the table above. Including all parents we know to contribute above the dam (including carcasses) the inferred number of succesful parents should be 478 (reintros above + outplants + carcasses known to have spawned above the dam). I do some extra work to explore this in the section “2015 Combined” below.

Note there’s a problem with the second table. We filter duplicate parents from the pedigree to get the counts (i.e. count each successful parent once, not once per each offspring it produces), so when a single individual parent has spawning partners of multiple types, the count by type is only counted once. For example, a 2015 outplant individual that spawns with both other outplants and reintros is either included in the outplants total or the outplant/reintro total depdending on which is the first (by row order) in the pedigree dataframe. Figuring out the right way to add these up will take some time, in the interest of time I’m not going to fix this problem, unless we really want to include these numbers in the report. For now, we should just be aware that the counts in the second table are MINIMUMS, not the actual number, but they highlight the question about what is the best way to think about cohort size: by parent type, or by above/below the dams

2015 Combined

To further explore the issues described above. I also ran NeEstimator for all parents above Detroit in 2015 (excluding carcasses sampled below the dam, but assigned to have spawned with a second parent above the dam).

# Here we filter the genotype data to get only offspring from 2011 cohor
offspring_of_2015 <- pedigree_meta %>%
  filter(mother_year == 2015 | father_year ==2015) %>%
  filter(mother_type  == "reintro_above" | father_type == "reintro_above" | mother_type == "outplant" | father_type == "outplant" | mother_type == "carcass_above" | father_type == "carcass_above") %>%
  pull(offspring_sample_id)

gts_off_of_2015 <- full_meta %>%
  filter(sample_id %in% offspring_of_2015) %>%
  select(sample_id, starts_with(c("Ot", "Ogo", "SSa"))) %>% #grab the genotypes
  mutate(across(.cols = everything(), ~na_if(., "0")))

#now we put bth alleles into a single column for each locus
gts_off_of_2015 %<>%
  gather(key = var, value = value, -sample_id) %>%
  mutate(var = str_extract(var, "\\d+") %>% as.numeric()) %>% 
  group_by(sample_id, var) %>%
  summarise(combined = paste(value, collapse = "")) %>% 
  spread(key = var, value = combined) 

# add a dummy pop variable for conversion, fix NAs
gts_off_of_2015 %<>%
  add_column(pop = "p") %>%
  relocate(sample_id, pop) %>%
  mutate(across(.cols = everything(), ~str_replace(., "NANA", "000000")))

#write_genepop_zlr(loci = gts_off_of_2015[,3:ncol(gts_off_of_2015)],pops = gts_off_of_2015$pop,ind.ids = gts_off_of_2015$sample_id,folder = "neestimator/2015_above/",filename ="genepop_2015.txt",ncode = 3,diploid = T)

For the 2015 above Detroit parent cohort (ignoring carcasses to be consistent with the table as presented earlier), the number of offspring in the dataset was 717, Nb was 370.4 (321.6 - 430.2). Here we see that when considered all of the successful parents above Detroit together, the Nb results are not so problematic. The ratio of Nb to number of successful parents is a more reasonable ~78% (compared to 95% for outplants and 92% for reintros).

Appendix

We already have allele frequencies and HWE from many places, but let’s be consistent and calculate using GENEPOP. This requires conversion to GENEPOP format.

offspring2020 <- pedigree_meta %>%
  filter(offspring_year == 2020) %>%
  pull(offspring_sample_id)

gts_of_2020 <- full_meta %>%
  filter(sample_id %in% offspring2020) %>%
  select(sample_id, starts_with(c("Ot", "Ogo", "SSa"))) %>% #grab the genotypes
  mutate(across(.cols = everything(), ~na_if(., "0")))

#now we put bth alleles into a single column for each locus
gts_of_2020 %<>%
  gather(key = var, value = value, -sample_id) %>%
  mutate(var = str_extract(var, "\\d+") %>% as.numeric()) %>% 
  group_by(sample_id, var) %>%
  summarise(combined = paste(value, collapse = "")) %>% 
  spread(key = var, value = combined) 
## `summarise()` has grouped output by 'sample_id'. You can override using the
## `.groups` argument.
# add a dummy pop variable for conversion, fix NAs
gts_of_2020 %<>%
  add_column(pop = "p") %>%
  relocate(sample_id, pop) %>%
  mutate(across(.cols = everything(), ~str_replace(., "NANA", "000000")))

#write_genepop_zlr(loci = gts_of_2020[,3:ncol(gts_of_2020)],pops = gts_of_2020$pop,ind.ids = gts_of_2020$sample_id,folder = "analysis/genepop/",filename ="genepop_2020.txt",ncode = 3,diploid = T)
offspring2017 <- pedigree_meta %>%
  filter(offspring_year == 2017) %>%
  pull(offspring_sample_id)

gts_of_2017 <- full_meta %>%
  filter(sample_id %in% offspring2017) %>%
  select(sample_id, starts_with(c("Ot", "Ogo", "SSa"))) %>% #grab the genotypes
  mutate(across(.cols = everything(), ~na_if(., "0")))

#now we put bth alleles into a single column for each locus
gts_of_2017 %<>%
  gather(key = var, value = value, -sample_id) %>%
  mutate(var = str_extract(var, "\\d+") %>% as.numeric()) %>% 
  group_by(sample_id, var) %>%
  summarise(combined = paste(value, collapse = "")) %>% 
  spread(key = var, value = combined) 

# add a dummy pop variable for conversion, fix NAs
gts_of_2017 %<>%
  add_column(pop = "p") %>%
  relocate(sample_id, pop) %>%
  mutate(across(.cols = everything(), ~str_replace(., "NANA", "000000")))

#write_genepop_zlr(loci = gts_of_2017[,3:ncol(gts_of_2017)],pops = gts_of_2017$pop,ind.ids = gts_of_2017$sample_id,folder = "analysis/genepop/",filename ="genepop_2017.txt",ncode = 3,diploid = T)